hololens committed
Commit e04dce3 · verified · 1 parent: 6dd2b58

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50):
  1. .gitattributes +232 -0
  2. .gitignore +3 -0
  3. CHANGELOG.md +131 -0
  4. LICENSE +21 -0
  5. README.md +244 -12
  6. __init__.py +0 -0
  7. bundled_sources.txt +25 -0
  8. ddepth_anything_v2/DA-2K.md +51 -0
  9. ddepth_anything_v2/LICENSE +201 -0
  10. ddepth_anything_v2/README.md +201 -0
  11. ddepth_anything_v2/__init__.py +1 -0
  12. ddepth_anything_v2/__pycache__/__init__.cpython-310.pyc +0 -0
  13. ddepth_anything_v2/__pycache__/__init__.cpython-311.pyc +0 -0
  14. ddepth_anything_v2/__pycache__/__init__.cpython-312.pyc +0 -0
  15. ddepth_anything_v2/app.py +88 -0
  16. ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc +0 -0
  17. ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-311.pyc +0 -0
  18. ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-312.pyc +0 -0
  19. ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-310.pyc +0 -0
  20. ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-311.pyc +0 -0
  21. ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-312.pyc +0 -0
  22. ddepth_anything_v2/depth_anything_v2/dinov2.py +415 -0
  23. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py +11 -0
  24. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc +0 -0
  25. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-311.pyc +0 -0
  26. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-312.pyc +0 -0
  27. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc +0 -0
  28. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-311.pyc +0 -0
  29. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-312.pyc +0 -0
  30. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc +0 -0
  31. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-311.pyc +0 -0
  32. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-312.pyc +0 -0
  33. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc +0 -0
  34. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-311.pyc +0 -0
  35. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-312.pyc +0 -0
  36. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc +0 -0
  37. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-311.pyc +0 -0
  38. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-312.pyc +0 -0
  39. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc +0 -0
  40. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-311.pyc +0 -0
  41. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-312.pyc +0 -0
  42. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc +0 -0
  43. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-311.pyc +0 -0
  44. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-312.pyc +0 -0
  45. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc +0 -0
  46. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-311.pyc +0 -0
  47. ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-312.pyc +0 -0
  48. ddepth_anything_v2/depth_anything_v2/dinov2_layers/attention.py +83 -0
  49. ddepth_anything_v2/depth_anything_v2/dinov2_layers/block.py +252 -0
  50. ddepth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
.gitattributes CHANGED
@@ -33,3 +33,235 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples.png filter=lfs diff=lfs merge=lfs -text
37
+ outputs/depthmap-17278951300005-left-right.png filter=lfs diff=lfs merge=lfs -text
38
+ outputs/depthmap-17278951300006-left-right.png filter=lfs diff=lfs merge=lfs -text
39
+ outputs/depthmap-17278951300007-left-right.png filter=lfs diff=lfs merge=lfs -text
40
+ outputs/depthmap-17278951300008-left-right.png filter=lfs diff=lfs merge=lfs -text
41
+ outputs/depthmap-17278951300009-left-right.png filter=lfs diff=lfs merge=lfs -text
42
+ outputs/depthmap-17278951300010-left-right.png filter=lfs diff=lfs merge=lfs -text
43
+ outputs/depthmap-17278951300011-left-right.png filter=lfs diff=lfs merge=lfs -text
44
+ outputs/depthmap-17278951300012-left-right.png filter=lfs diff=lfs merge=lfs -text
45
+ outputs/depthmap-17278951300013-left-right.png filter=lfs diff=lfs merge=lfs -text
46
+ outputs/depthmap-17278951300014-left-right.png filter=lfs diff=lfs merge=lfs -text
47
+ outputs/depthmap-17278951300015-left-right.png filter=lfs diff=lfs merge=lfs -text
48
+ outputs/depthmap-17278951300016-left-right.png filter=lfs diff=lfs merge=lfs -text
49
+ outputs/depthmap-17278951300017-left-right.png filter=lfs diff=lfs merge=lfs -text
50
+ outputs/depthmap-17278951300018-left-right.png filter=lfs diff=lfs merge=lfs -text
51
+ outputs/depthmap-17278951300019-left-right.png filter=lfs diff=lfs merge=lfs -text
52
+ outputs/depthmap-17278951300020-left-right.png filter=lfs diff=lfs merge=lfs -text
53
+ outputs/depthmap-17278951300021-left-right.png filter=lfs diff=lfs merge=lfs -text
54
+ outputs/depthmap-17278951300022-left-right.png filter=lfs diff=lfs merge=lfs -text
55
+ outputs/depthmap-17278951300023-left-right.png filter=lfs diff=lfs merge=lfs -text
56
+ outputs/depthmap-17278951300024-left-right.png filter=lfs diff=lfs merge=lfs -text
57
+ outputs/depthmap-17278951300025-left-right.png filter=lfs diff=lfs merge=lfs -text
58
+ outputs/depthmap-17278951300026-left-right.png filter=lfs diff=lfs merge=lfs -text
59
+ outputs/depthmap-17278951300027-left-right.png filter=lfs diff=lfs merge=lfs -text
60
+ outputs/depthmap-17278951300028-left-right.png filter=lfs diff=lfs merge=lfs -text
61
+ outputs/depthmap-17278951300029-left-right.png filter=lfs diff=lfs merge=lfs -text
62
+ outputs/depthmap-17278951300030-left-right.png filter=lfs diff=lfs merge=lfs -text
63
+ outputs/depthmap-17278951300031-left-right.png filter=lfs diff=lfs merge=lfs -text
64
+ outputs/depthmap-17278951300033-left-right.png filter=lfs diff=lfs merge=lfs -text
65
+ outputs/depthmap-17278951300034.png filter=lfs diff=lfs merge=lfs -text
66
+ outputs/depthmap-17278951300035-left-right.png filter=lfs diff=lfs merge=lfs -text
67
+ outputs/depthmap-17278951300036.png filter=lfs diff=lfs merge=lfs -text
68
+ outputs/depthmap-17278951300037-left-right.png filter=lfs diff=lfs merge=lfs -text
69
+ outputs/depthmap-17278951300039-left-right.png filter=lfs diff=lfs merge=lfs -text
70
+ outputs/depthmap-17278951300040.png filter=lfs diff=lfs merge=lfs -text
71
+ outputs/depthmap-17278951300041-left-right.png filter=lfs diff=lfs merge=lfs -text
72
+ outputs/depthmap-17278951300043-left-right.png filter=lfs diff=lfs merge=lfs -text
73
+ outputs/depthmap-17278951300045-left-right.png filter=lfs diff=lfs merge=lfs -text
74
+ outputs/depthmap-17278951300053-simple.obj filter=lfs diff=lfs merge=lfs -text
75
+ outputs/depthmap-17278951300056-simple.obj filter=lfs diff=lfs merge=lfs -text
76
+ outputs/depthmap-17278951300061-simple.obj filter=lfs diff=lfs merge=lfs -text
77
+ outputs/depthmap-17280589390003-left-right.png filter=lfs diff=lfs merge=lfs -text
78
+ outputs/depthmap-17280589390008.png filter=lfs diff=lfs merge=lfs -text
79
+ outputs/depthmap-17280589390009-left-right.png filter=lfs diff=lfs merge=lfs -text
80
+ outputs/depthmap-17280589390010.png filter=lfs diff=lfs merge=lfs -text
81
+ outputs/depthmap-17280589390011-left-right.png filter=lfs diff=lfs merge=lfs -text
82
+ outputs/depthmap-17280589390012.png filter=lfs diff=lfs merge=lfs -text
83
+ outputs/depthmap-17280589390013-left-right.png filter=lfs diff=lfs merge=lfs -text
84
+ outputs/depthmap-17280589390015-left-right.png filter=lfs diff=lfs merge=lfs -text
85
+ outputs/depthmap-17280589390017-left-right.png filter=lfs diff=lfs merge=lfs -text
86
+ outputs/depthmap-17280589390023-left-right.png filter=lfs diff=lfs merge=lfs -text
87
+ outputs/depthmap-17280589390025-left-right.png filter=lfs diff=lfs merge=lfs -text
88
+ outputs/depthmap-17280589390027-left-right.png filter=lfs diff=lfs merge=lfs -text
89
+ outputs/depthmap-17280589390029-left-right.png filter=lfs diff=lfs merge=lfs -text
90
+ outputs/depthmap-17280589390031-left-right.png filter=lfs diff=lfs merge=lfs -text
91
+ outputs/depthmap-17280589390033-left-right.png filter=lfs diff=lfs merge=lfs -text
92
+ outputs/depthmap-17280589390035-left-right.png filter=lfs diff=lfs merge=lfs -text
93
+ outputs/depthmap-17280589390037-left-right.png filter=lfs diff=lfs merge=lfs -text
94
+ outputs/depthmap-17280589390039-left-right.png filter=lfs diff=lfs merge=lfs -text
95
+ outputs/depthmap-17280589390041-left-right.png filter=lfs diff=lfs merge=lfs -text
96
+ outputs/depthmap-17280589390043-left-right.png filter=lfs diff=lfs merge=lfs -text
97
+ outputs/depthmap-17280589390045-left-right.png filter=lfs diff=lfs merge=lfs -text
98
+ outputs/depthmap-17280589390047-left-right.png filter=lfs diff=lfs merge=lfs -text
99
+ outputs/depthmap-17280589390049-left-right.png filter=lfs diff=lfs merge=lfs -text
100
+ outputs/depthmap-17280589390051-left-right.png filter=lfs diff=lfs merge=lfs -text
101
+ outputs/depthmap-17280589390052.png filter=lfs diff=lfs merge=lfs -text
102
+ outputs/depthmap-17280589390053-left-right.png filter=lfs diff=lfs merge=lfs -text
103
+ outputs/depthmap-17280589390054.png filter=lfs diff=lfs merge=lfs -text
104
+ outputs/depthmap-17280589390055-left-right.png filter=lfs diff=lfs merge=lfs -text
105
+ outputs/depthmap-17280589390056.png filter=lfs diff=lfs merge=lfs -text
106
+ outputs/depthmap-17280589390057-left-right.png filter=lfs diff=lfs merge=lfs -text
107
+ outputs/depthmap-17280589390058.png filter=lfs diff=lfs merge=lfs -text
108
+ outputs/depthmap-17280589390059-left-right.png filter=lfs diff=lfs merge=lfs -text
109
+ outputs/depthmap-17280589390061-left-right.png filter=lfs diff=lfs merge=lfs -text
110
+ outputs/depthmap-17280589390063-left-right.png filter=lfs diff=lfs merge=lfs -text
111
+ outputs/depthmap-17280589390065-left-right.png filter=lfs diff=lfs merge=lfs -text
112
+ outputs/depthmap-17280589390067-left-right.png filter=lfs diff=lfs merge=lfs -text
113
+ outputs/depthmap-17280589390069-left-right.png filter=lfs diff=lfs merge=lfs -text
114
+ outputs/depthmap-17280589390071-left-right.png filter=lfs diff=lfs merge=lfs -text
115
+ outputs/depthmap-17280589390073-left-right.png filter=lfs diff=lfs merge=lfs -text
116
+ outputs/depthmap-17280589390075-left-right.png filter=lfs diff=lfs merge=lfs -text
117
+ outputs/depthmap-17280589390077-left-right.png filter=lfs diff=lfs merge=lfs -text
118
+ outputs/depthmap-17280589390079-left-right.png filter=lfs diff=lfs merge=lfs -text
119
+ outputs/depthmap-17280589390081-left-right.png filter=lfs diff=lfs merge=lfs -text
120
+ outputs/depthmap-17280589390085-left-right.png filter=lfs diff=lfs merge=lfs -text
121
+ outputs/depthmap-17280589390087-left-right.png filter=lfs diff=lfs merge=lfs -text
122
+ outputs/depthmap-17280589390089-left-right.png filter=lfs diff=lfs merge=lfs -text
123
+ outputs/depthmap-17285060200001.png filter=lfs diff=lfs merge=lfs -text
124
+ outputs/depthmap-17285060200002-left-right.png filter=lfs diff=lfs merge=lfs -text
125
+ outputs/depthmap-17285060200003-top-bottom.png filter=lfs diff=lfs merge=lfs -text
126
+ outputs/depthmap-17285371260002-left-right.png filter=lfs diff=lfs merge=lfs -text
127
+ outputs/depthmap-17285859980001.png filter=lfs diff=lfs merge=lfs -text
128
+ outputs/depthmap-17285859980002.png filter=lfs diff=lfs merge=lfs -text
129
+ outputs/depthmap-17285859980003-left-right.png filter=lfs diff=lfs merge=lfs -text
130
+ outputs/depthmap-17285861380002-left-right.png filter=lfs diff=lfs merge=lfs -text
131
+ outputs/depthmap-17285861380003-left-right_video.avi filter=lfs diff=lfs merge=lfs -text
132
+ outputs/depthmap-17285861380004-depth_video.avi filter=lfs diff=lfs merge=lfs -text
133
+ outputs/depthmap-17286927930002-left-right.png filter=lfs diff=lfs merge=lfs -text
134
+ outputs/depthmap-17286927930003.png filter=lfs diff=lfs merge=lfs -text
135
+ outputs/depthmap-17286927930004-left-right.png filter=lfs diff=lfs merge=lfs -text
136
+ outputs/depthmap-17286927930005.png filter=lfs diff=lfs merge=lfs -text
137
+ outputs/depthmap-17286927930006-left-right.png filter=lfs diff=lfs merge=lfs -text
138
+ outputs/depthmap-17286927930010-left-right.png filter=lfs diff=lfs merge=lfs -text
139
+ outputs/depthmap-17286927930012-left-right.png filter=lfs diff=lfs merge=lfs -text
140
+ outputs/depthmap-17286927930016-left-right.png filter=lfs diff=lfs merge=lfs -text
141
+ outputs/depthmap-17286927930018-left-right.png filter=lfs diff=lfs merge=lfs -text
142
+ outputs/depthmap-17286927930020-left-right.png filter=lfs diff=lfs merge=lfs -text
143
+ outputs/depthmap-17286927930026-left-right.png filter=lfs diff=lfs merge=lfs -text
144
+ outputs/depthmap-17286927930028-left-right.png filter=lfs diff=lfs merge=lfs -text
145
+ outputs/depthmap-17286927930036-left-right.png filter=lfs diff=lfs merge=lfs -text
146
+ outputs/depthmap-17286927930046-left-right.png filter=lfs diff=lfs merge=lfs -text
147
+ outputs/depthmap-17286927930050-left-right.png filter=lfs diff=lfs merge=lfs -text
148
+ outputs/depthmap-17286927930052-left-right.png filter=lfs diff=lfs merge=lfs -text
149
+ outputs/depthmap-17286927930053.png filter=lfs diff=lfs merge=lfs -text
150
+ outputs/depthmap-17286927930054-left-right.png filter=lfs diff=lfs merge=lfs -text
151
+ outputs/depthmap-17286927930055.png filter=lfs diff=lfs merge=lfs -text
152
+ outputs/depthmap-17286927930056-left-right.png filter=lfs diff=lfs merge=lfs -text
153
+ outputs/depthmap-17286927930057.png filter=lfs diff=lfs merge=lfs -text
154
+ outputs/depthmap-17286927930058-left-right.png filter=lfs diff=lfs merge=lfs -text
155
+ outputs/depthmap-17286927930059.png filter=lfs diff=lfs merge=lfs -text
156
+ outputs/depthmap-17286927930060-left-right.png filter=lfs diff=lfs merge=lfs -text
157
+ outputs/depthmap-17286927930061.png filter=lfs diff=lfs merge=lfs -text
158
+ outputs/depthmap-17286927930062-left-right.png filter=lfs diff=lfs merge=lfs -text
159
+ outputs/depthmap-17286927930063.png filter=lfs diff=lfs merge=lfs -text
160
+ outputs/depthmap-17286927930064-left-right.png filter=lfs diff=lfs merge=lfs -text
161
+ outputs/depthmap-17286927930066-left-right.png filter=lfs diff=lfs merge=lfs -text
162
+ outputs/depthmap-17286927930070-left-right.png filter=lfs diff=lfs merge=lfs -text
163
+ outputs/depthmap-17286927930072-left-right.png filter=lfs diff=lfs merge=lfs -text
164
+ outputs/depthmap-17286927930080-left-right.png filter=lfs diff=lfs merge=lfs -text
165
+ outputs/depthmap-17286927930082-left-right.png filter=lfs diff=lfs merge=lfs -text
166
+ outputs/depthmap-17286927930084-left-right.png filter=lfs diff=lfs merge=lfs -text
167
+ outputs/depthmap-17286927930120-left-right.png filter=lfs diff=lfs merge=lfs -text
168
+ outputs/depthmap-17286927930126-left-right.png filter=lfs diff=lfs merge=lfs -text
169
+ outputs/depthmap-17286927930132-left-right.png filter=lfs diff=lfs merge=lfs -text
170
+ outputs/depthmap-17286927930142-left-right.png filter=lfs diff=lfs merge=lfs -text
171
+ outputs/depthmap-17286927930147.png filter=lfs diff=lfs merge=lfs -text
172
+ outputs/depthmap-17286927930152-left-right.png filter=lfs diff=lfs merge=lfs -text
173
+ outputs/depthmap-17286927930154-left-right.png filter=lfs diff=lfs merge=lfs -text
174
+ outputs/depthmap-17286927930156-left-right.png filter=lfs diff=lfs merge=lfs -text
175
+ outputs/depthmap-17286927930158-left-right.png filter=lfs diff=lfs merge=lfs -text
176
+ outputs/depthmap-17286927930160-left-right.png filter=lfs diff=lfs merge=lfs -text
177
+ outputs/depthmap-17286927930162-left-right.png filter=lfs diff=lfs merge=lfs -text
178
+ outputs/depthmap-17286927930164-left-right.png filter=lfs diff=lfs merge=lfs -text
179
+ outputs/depthmap-17286927930166-left-right.png filter=lfs diff=lfs merge=lfs -text
180
+ outputs/depthmap-17286927930168-left-right.png filter=lfs diff=lfs merge=lfs -text
181
+ outputs/depthmap-17286927930170-left-right.png filter=lfs diff=lfs merge=lfs -text
182
+ outputs/depthmap-17286927930172-left-right.png filter=lfs diff=lfs merge=lfs -text
183
+ outputs/depthmap-17286927930174-left-right.png filter=lfs diff=lfs merge=lfs -text
184
+ outputs/depthmap-17286927930176-left-right.png filter=lfs diff=lfs merge=lfs -text
185
+ outputs/depthmap-17286927930178-left-right.png filter=lfs diff=lfs merge=lfs -text
186
+ outputs/depthmap-17286927930180-left-right.png filter=lfs diff=lfs merge=lfs -text
187
+ outputs/depthmap-17286927930182-left-right.png filter=lfs diff=lfs merge=lfs -text
188
+ outputs/depthmap-17286927930184-left-right.png filter=lfs diff=lfs merge=lfs -text
189
+ outputs/depthmap-17286927930186-left-right.png filter=lfs diff=lfs merge=lfs -text
190
+ outputs/depthmap-17286927930188-left-right.png filter=lfs diff=lfs merge=lfs -text
191
+ outputs/depthmap-17286927930190-left-right.png filter=lfs diff=lfs merge=lfs -text
192
+ outputs/depthmap-17286927930194-left-right.png filter=lfs diff=lfs merge=lfs -text
193
+ outputs/depthmap-17286927930196-left-right.png filter=lfs diff=lfs merge=lfs -text
194
+ outputs/depthmap-17286927930198-left-right.png filter=lfs diff=lfs merge=lfs -text
195
+ outputs/depthmap-17286927930199.png filter=lfs diff=lfs merge=lfs -text
196
+ outputs/depthmap-17286927930200-left-right.png filter=lfs diff=lfs merge=lfs -text
197
+ outputs/depthmap-17286927930202-left-right.png filter=lfs diff=lfs merge=lfs -text
198
+ outputs/depthmap-17286927930204-left-right.png filter=lfs diff=lfs merge=lfs -text
199
+ outputs/depthmap-17286927930206-left-right.png filter=lfs diff=lfs merge=lfs -text
200
+ outputs/depthmap-17286927930208-left-right.png filter=lfs diff=lfs merge=lfs -text
201
+ outputs/depthmap-17286927930210-left-right.png filter=lfs diff=lfs merge=lfs -text
202
+ outputs/depthmap-17286927930212-left-right.png filter=lfs diff=lfs merge=lfs -text
203
+ outputs/depthmap-17286927930213.png filter=lfs diff=lfs merge=lfs -text
204
+ outputs/depthmap-17286927930214-left-right.png filter=lfs diff=lfs merge=lfs -text
205
+ outputs/depthmap-17286927930216-left-right.png filter=lfs diff=lfs merge=lfs -text
206
+ outputs/depthmap-17286927930218-left-right.png filter=lfs diff=lfs merge=lfs -text
207
+ outputs/depthmap-17286927930220-left-right.png filter=lfs diff=lfs merge=lfs -text
208
+ outputs/depthmap-17286927930222-left-right.png filter=lfs diff=lfs merge=lfs -text
209
+ outputs/depthmap-17286927930224-left-right.png filter=lfs diff=lfs merge=lfs -text
210
+ outputs/depthmap-17286927930226-left-right.png filter=lfs diff=lfs merge=lfs -text
211
+ outputs/depthmap-17286927930228-left-right.png filter=lfs diff=lfs merge=lfs -text
212
+ outputs/depthmap-17286927930230-left-right.png filter=lfs diff=lfs merge=lfs -text
213
+ outputs/depthmap-17286927930232-left-right.png filter=lfs diff=lfs merge=lfs -text
214
+ outputs/depthmap-17286927930234-left-right.png filter=lfs diff=lfs merge=lfs -text
215
+ outputs/depthmap-17286927930236-left-right.png filter=lfs diff=lfs merge=lfs -text
216
+ outputs/depthmap-17286927930238-left-right.png filter=lfs diff=lfs merge=lfs -text
217
+ outputs/depthmap-17286927930240-left-right.png filter=lfs diff=lfs merge=lfs -text
218
+ outputs/depthmap-17286927930242-left-right.png filter=lfs diff=lfs merge=lfs -text
219
+ outputs/depthmap-17286927930244-left-right.png filter=lfs diff=lfs merge=lfs -text
220
+ outputs/depthmap-17286927930246-left-right.png filter=lfs diff=lfs merge=lfs -text
221
+ outputs/depthmap-17286927930248-left-right.png filter=lfs diff=lfs merge=lfs -text
222
+ outputs/depthmap-17286927930250-left-right.png filter=lfs diff=lfs merge=lfs -text
223
+ outputs/depthmap-17286927930252-left-right.png filter=lfs diff=lfs merge=lfs -text
224
+ outputs/depthmap-17286927930253.png filter=lfs diff=lfs merge=lfs -text
225
+ outputs/depthmap-17286927930254-left-right.png filter=lfs diff=lfs merge=lfs -text
226
+ outputs/depthmap-17286927930256-left-right.png filter=lfs diff=lfs merge=lfs -text
227
+ outputs/depthmap-17286927930258-left-right.png filter=lfs diff=lfs merge=lfs -text
228
+ outputs/depthmap-17286927930260-left-right.png filter=lfs diff=lfs merge=lfs -text
229
+ outputs/depthmap-17286927930261.png filter=lfs diff=lfs merge=lfs -text
230
+ outputs/depthmap-17286927930262-left-right.png filter=lfs diff=lfs merge=lfs -text
231
+ outputs/depthmap-17286927930263.png filter=lfs diff=lfs merge=lfs -text
232
+ outputs/depthmap-17286927930264-left-right.png filter=lfs diff=lfs merge=lfs -text
233
+ outputs/depthmap-17286927930265.png filter=lfs diff=lfs merge=lfs -text
234
+ outputs/depthmap-17286927930266-left-right.png filter=lfs diff=lfs merge=lfs -text
235
+ outputs/depthmap-17286927930268-left-right.png filter=lfs diff=lfs merge=lfs -text
236
+ outputs/depthmap-17286927930270-left-right.png filter=lfs diff=lfs merge=lfs -text
237
+ outputs/depthmap-17286927930272-left-right.png filter=lfs diff=lfs merge=lfs -text
238
+ outputs/depthmap-17286927930274-left-right.png filter=lfs diff=lfs merge=lfs -text
239
+ outputs/depthmap-17286927930276-left-right.png filter=lfs diff=lfs merge=lfs -text
240
+ outputs/depthmap-17286927930278-left-right.png filter=lfs diff=lfs merge=lfs -text
241
+ outputs/depthmap-17286927930280-left-right.png filter=lfs diff=lfs merge=lfs -text
242
+ outputs/depthmap-17286927930282-left-right.png filter=lfs diff=lfs merge=lfs -text
243
+ outputs/depthmap-17286927930284-left-right.png filter=lfs diff=lfs merge=lfs -text
244
+ outputs/depthmap-17286927930286-left-right.png filter=lfs diff=lfs merge=lfs -text
245
+ outputs/depthmap-17286927930288-left-right.png filter=lfs diff=lfs merge=lfs -text
246
+ outputs/depthmap-17286927930290-left-right.png filter=lfs diff=lfs merge=lfs -text
247
+ outputs/depthmap-17286927930292-left-right.png filter=lfs diff=lfs merge=lfs -text
248
+ outputs/depthmap-17286927930294-left-right.png filter=lfs diff=lfs merge=lfs -text
249
+ outputs/depthmap-17286927930296-left-right.png filter=lfs diff=lfs merge=lfs -text
250
+ outputs/depthmap-17286927930298-left-right.png filter=lfs diff=lfs merge=lfs -text
251
+ outputs/depthmap-17286927930300-left-right.png filter=lfs diff=lfs merge=lfs -text
252
+ outputs/depthmap-17286927930302-left-right.png filter=lfs diff=lfs merge=lfs -text
253
+ outputs/depthmap-17286927930304-left-right.png filter=lfs diff=lfs merge=lfs -text
254
+ outputs/depthmap-17286927930306-left-right.png filter=lfs diff=lfs merge=lfs -text
255
+ outputs/depthmap-17286927930308-left-right.png filter=lfs diff=lfs merge=lfs -text
256
+ outputs/depthmap-17286927930310-left-right.png filter=lfs diff=lfs merge=lfs -text
257
+ outputs/depthmap-17286927930312-left-right.png filter=lfs diff=lfs merge=lfs -text
258
+ outputs/depthmap-17286927930316-left-right.png filter=lfs diff=lfs merge=lfs -text
259
+ outputs/depthmap-17286927930318-left-right.png filter=lfs diff=lfs merge=lfs -text
260
+ outputs/depthmap-17286927930322-left-right.png filter=lfs diff=lfs merge=lfs -text
261
+ outputs/depthmap-17286927930324-left-right.png filter=lfs diff=lfs merge=lfs -text
262
+ outputs/depthmap-17286927930326-left-right.png filter=lfs diff=lfs merge=lfs -text
263
+ outputs/depthmap-17286927930328-left-right.png filter=lfs diff=lfs merge=lfs -text
264
+ outputs/depthmap-17286927930330-left-right.png filter=lfs diff=lfs merge=lfs -text
265
+ outputs/depthmap-17286927930332-left-right.png filter=lfs diff=lfs merge=lfs -text
266
+ outputs/depthmap-17286927930334-left-right.png filter=lfs diff=lfs merge=lfs -text
267
+ outputs/depthmap-17286927930336-left-right.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ __pycache__/
2
+ venv/
3
+ .idea/
CHANGELOG.md ADDED
@@ -0,0 +1,131 @@
1
+ ## Changelog
2
+ ### 0.4.8
3
+ * Depth Anything V2 support, thanks [@graemeniedermayer](https://github.com/graemeniedermayer)!
4
+ ### 0.4.7
5
+ * Tiling mode
6
+ * Reduced VRAM consumption for Depth Anything, as well as for ZoeDepth k and nk
7
+ * Some bugfixes
8
+ ### 0.4.6
9
+ * Support for [Depth Anything](https://github.com/LiheYoung/Depth-Anything).
10
+ ### 0.4.5
11
+ * Preliminary support for [Marigold](https://marigoldmonodepth.github.io). [PR #385](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/385).
12
+ ### 0.4.4
13
+ * Compatibility with stable-diffusion-webui 1.6.0
14
+ ### 0.4.3 video processing tab
15
+ Added an option to process videos directly from a video file. This leads to better results than batch-processing individual frames of a video. It also allows generating depthmap videos that can be used in further generations as custom depthmap videos.
16
+ * UI improvements.
17
+ * Extra stereoimage generation modes - enable in extension settings if you want to use them.
18
+ * New stereoimage generation parameter - offset exponent. Setting it to 1 may produce more realistic outputs.
19
+ ### 0.4.2
20
+ * Added UI options for 2 additional rembg models.
21
+ * Heatmap generation UI option is hidden - if you want to use it, please activate it in the extension settings.
22
+ * Bugfixes.
23
+ ### 0.4.1 standalone mode
24
+ * Added ability to run DepthMap without WebUI. (Use main.py. Make sure all the dependencies are installed. The support is not feature-complete.)
25
+ * NormalMap generation
26
+ ### 0.4.0 large code refactor
27
+ * UI improvements
28
+ * Improved Batch from Directory, Clip and renormalize DepthMap
29
+ * Slightly changed the behaviour of various options
30
+ * Extension may partially work even if some of the dependencies are unmet
31
+
32
+ ### 0.3.12
33
+ * Fixed stereo image generation
34
+ * Other bugfixes
35
+ ### 0.3.11
36
+ * 3D model viewer (Experimental!)
37
+ * simple and fast (occluded) 3D mesh generation, support for equirectangular projection
38
+ (accurate results with ZoeDepth models only, no boost, no custom maps)
39
+ * default output format is now obj for inpainted mesh and simple mesh
40
+ ### 0.3.10
41
+ * ZoeDepth support (with boost), 3 new models, best results so far
42
+ * better heatmap
43
+ ### 0.3.9
44
+ * use existing/custom depthmaps in output dir for batch mode
45
+ * custom depthmap support for single file
46
+ * wavefront obj output support for inpainted mesh (enabled in settings)
47
+ * option to generate all stereo formats at once
48
+ * bugfix: convert single channel input image to rgb
49
+ * renamed midas imports to fix conflict with deforum
50
+ * ui cleanup
51
+ ### 0.3.8 bugfix
52
+ * bugfix in remove background path
53
+ ### 0.3.7 new features
54
+ * [rembg](https://github.com/danielgatis/rembg) Remove Background [PR](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/78) by [@graemeniedermayer](https://github.com/graemeniedermayer) merged
55
+ * setting to flip Left/Right SBS images
56
+ * added missing parameter for 3d inpainting (repeat_inpaint_edge)
57
+ * option to generate demo videos with mesh
58
+ ### 0.3.6 new feature
59
+ * implemented binary ply file format for the inpainted 3D mesh, big reduction in filesize and save/load times.
60
+ * added progress indicators to the inpainting process
61
+ ### 0.3.5 bugfix
62
+ * create path to 3dphoto models before download (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/76))
63
+ ### 0.3.4 new features
64
+ * depth clipping option (original idea by [@Extraltodeus](https://github.com/Extraltodeus))
65
+ * by popular demand, 3D-Photo-Inpainting is now implemented
66
+ * generate inpainted 3D mesh (PLY) and videos of said mesh
67
+ ### 0.3.3 bugfix and new midas models
68
+ * updated to midas 3.1, bringing 2 new depth models (the 512 one eats VRAM for breakfast!)
69
+ * fix Next-ViT dependency issue for new installs
70
+ * extension no longer clones repositories, all dependencies are now contained in the extension
71
+ ### 0.3.2 new feature and bugfixes
72
+ * several bug fixes for apple silicon and other machines without cuda
73
+ * NEW Stereo Image Generation techniques for gap filling by [@semjon00](https://github.com/semjon00) using polylines. (See [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/56)) Significant improvement in quality.
74
+ ### 0.3.1 bugfix
75
+ * small speed increase for anaglyph creation
76
+ * clone midas repo before midas 3.1 to fix issue (see [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/55#issue-1510266008))
77
+ ### 0.3.0 improved stereo image generation
78
+ * New improved technique for generating stereo images and balancing distortion between eyes by [@semjon00](https://github.com/semjon00) (See [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/51))
79
+ * Substantial speedup of stereo image generation code using numba JIT
80
+ ### 0.2.9 new feature
81
+ * 3D Stereo (side-by-side) and red/cyan anaglyph image generation.
82
+ (Thanks to [@sina-masoud-ansari](https://github.com/sina-masoud-ansari) for the tip! Discussion [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/discussions/45))
83
+ ### 0.2.8 bugfix
84
+ * boost (pix2pix) now also able to compute on cpu
85
+ * res101 able to compute on cpu
86
+ ### 0.2.7 separate tab
87
+ * Depth Tab now available for easier stand-alone (batch) processing
88
+ ### 0.2.6 ui layout and settings
89
+ * added link to repo so more people find their way to the instructions.
90
+ * boost rmax setting
91
+ ### 0.2.5 bugfix
92
+ * error checking on model download (now with progressbar)
93
+ ### 0.2.4 high resolution depthmaps
94
+ * multi-resolution merging is now implemented, significantly improving results!
95
+ * res101 can now also compute on CPU
96
+ ### 0.2.3 bugfix
97
+ * path error on linux fixed
98
+ ### 0.2.2 new features
99
+ * added (experimental) support for AdelaiDepth/LeReS (GPU Only!)
100
+ * new option to view depthmap as heatmap
101
+ * optimised ui layout
102
+ ### 0.2.1 bugfix
103
+ * Correct seed is now used in filename and pnginfo when running batches. (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/35))
104
+ ### 0.2.0 upgrade
105
+ * the script is now an extension, enabling auto installation.
106
+ ### 0.1.9 bugfixes
107
+ * sd model moved to system memory while computing depthmap
108
+ * memory leak/fragmentation issue fixed
109
+ * recover from out of memory error
110
+ ### 0.1.8 new options
111
+ * net size can now be set as width and height, option to match input size, sliders now have the same range as generation parameters. (see usage below)
112
+ * better error handling
113
+ ### 0.1.7 bugfixes
114
+ * batch img2img now works (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/21#issuecomment-1306445056))
115
+ * generation parameters now only saved when enabled in settings
116
+ * model memory freed explicitly at end of script
117
+ ### 0.1.6 new option
118
+ * option to invert depthmap (black=near, white=far), as required by some viewers.
119
+ ### 0.1.5 bugfix
120
+ * saving as any format other than PNG now always produces an 8 bit, 3 channel RGB image. A single channel 16 bit image is only supported when saving as PNG. (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/15#issuecomment-1304909019))
121
+ ### 0.1.4 update
122
+ * added support for `--no-half`. Now also works with cards that don't support half precision like GTX 16xx. ([verified](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/12#issuecomment-1304656398))
123
+ ### 0.1.3 bugfix
124
+ * bugfix where some controls were not visible (see [issue](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/11#issuecomment-1304400537))
125
+ ### 0.1.2 new option
126
+ * network size slider. higher resolution depth maps (see usage below)
127
+ ### 0.1.1 bugfixes
128
+ * overflow issue (see [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/issues/10) for details and examples of artifacts)
129
+ * when not combining, depthmap is now saved as single channel 16 bit
130
+ ### 0.1.0
131
+ * initial version: script mode, supports generating depthmaps with 4 different midas models
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Bob Thiry
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,244 @@
1
- ---
2
- title: Stable Diffusion Webui Depthmap Script
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.0.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ ---
2
+ title: stable-diffusion-webui-depthmap-script
3
+ app_file: main.py
4
+ sdk: gradio
5
+ sdk_version: 3.50.2
6
+ ---
7
+ # High Resolution Depth Maps for Stable Diffusion WebUI
8
+ This program is an addon for [AUTOMATIC1111's Stable Diffusion WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) that creates depth maps. Using either generated or custom depth maps, it can also create 3D stereo image pairs (side-by-side or anaglyph), normalmaps and 3D meshes. The outputs of the script can be viewed directly or used as an asset for a 3D engine. Please see the [wiki](https://github.com/thygate/stable-diffusion-webui-depthmap-script/wiki/Viewing-Results) to learn more. The program integrates with [Rembg](https://github.com/danielgatis/rembg). It also supports batch processing and video processing, and can be run in standalone mode, without Stable Diffusion WebUI.
9
+
10
+ To generate realistic depth maps from individual images, this script uses code and models from the [Marigold](https://github.com/prs-eth/Marigold/) repository, from the [MiDaS](https://github.com/isl-org/MiDaS) and [ZoeDepth](https://github.com/isl-org/ZoeDepth) repositories by Intel ISL, or LeReS from the [AdelaiDepth](https://github.com/aim-uofa/AdelaiDepth) repository by Advanced Intelligent Machines. Multi-resolution merging as implemented by [BoostingMonocularDepth](https://github.com/compphoto/BoostingMonocularDepth) is used to generate high resolution depth maps.
11
+
12
+ Stereoscopic images are created using a custom-written algorithm.
13
+
14
+ 3D Photography using Context-aware Layered Depth Inpainting by Virginia Tech Vision and Learning Lab, or [3D-Photo-Inpainting](https://github.com/vt-vl-lab/3d-photo-inpainting) is used to generate a `3D inpainted mesh` and render `videos` from said mesh.
15
+
16
+ Rembg uses [U-2-Net](https://github.com/xuebinqin/U-2-Net) and [IS-Net](https://github.com/xuebinqin/DIS).
17
+
18
+ ## Depthmap Examples
19
+ [![screenshot](examples.png)](https://raw.githubusercontent.com/thygate/stable-diffusion-webui-depthmap-script/main/examples.png)
20
+
21
+ ## 3D Photo Inpainting Examples
22
+ [![video](https://img.youtube.com/vi/jRmVkIMS-SY/0.jpg)](https://www.youtube.com/watch?v=jRmVkIMS-SY)
23
+ video by [@graemeniedermayer](https://github.com/graemeniedermayer), more examples [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/discussions/50)
24
+
25
+ ## Stereo Image SBS and Anaglyph Examples
26
+ ![](https://user-images.githubusercontent.com/54073010/210012661-ef07986c-2320-4700-bc54-fad3899f0186.png)
27
+ images generated by [@semjon00](https://github.com/semjon00) from CC0 photos, more examples [here](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/56#issuecomment-1367596463).
28
+
29
+ ## Install instructions
30
+ ### As extension
31
+ The script can be installed directly from the WebUI. Navigate to the `Extensions` tab, click `Available`, then `Load from`, and install the `Depth Maps` extension. Alternatively, the extension can be installed from the URL: `https://github.com/thygate/stable-diffusion-webui-depthmap-script`.
32
+
33
+ ### Updating
34
+ In the WebUI, in the `Extensions` tab, in the `Installed` subtab, click `Check for Updates` and then `Apply and restart UI`.
35
+
36
+ ### Standalone
37
+ Clone the repository, install the requirements from `requirements.txt`, launch using `main.py`.
38
+
39
+ >Model weights will be downloaded automatically on their first use and saved to /models/midas, /models/leres and /models/pix2pix. Zoedepth models are stored in the torch cache folder.
40
+
41
+
42
+ ## Usage
43
+ Select the "DepthMap" script from the script selection box in either txt2img or img2img, or go to the Depth tab when using existing images.
44
+ ![screenshot](options.png)
45
+
46
+ The models can `Compute on` GPU or CPU; use CPU if low on VRAM.
47
+
48
+ There are ten models available from the `Model` dropdown. For the first model, res101, see [AdelaiDepth/LeReS](https://github.com/aim-uofa/AdelaiDepth/tree/main/LeReS) for more info. The next six are the MiDaS models: dpt_beit_large_512, dpt_beit_large_384, dpt_large_384, dpt_hybrid_384, midas_v21, and midas_v21_small. See the [MiDaS](https://github.com/isl-org/MiDaS) repository for more info. The newest dpt_beit_large_512 model was trained on a 512x512 dataset but is VERY VRAM hungry. The last three models are [ZoeDepth](https://github.com/isl-org/ZoeDepth) models.
49
+
50
+ Net size can be set with `net width` and `net height`, or will be the same as the input image when `Match input size` is enabled. There is a trade-off between structural consistency and high-frequency details with respect to net size (see [observations](https://github.com/compphoto/BoostingMonocularDepth#observations)).
51
+
52
+ `Boost` will enable multi-resolution merging as implemented by [BoostingMonocularDepth](https://github.com/compphoto/BoostingMonocularDepth) and will significantly improve the results, mitigating the observations mentioned above, at the cost of much larger compute time. Best results with res101.
53
+
54
+ `Clip and renormalize` allows for clipping the depthmap on the `near` and `far` sides; the values in between will be renormalized to fit the available range. Set both values equal to get a b&w mask of a single depth plane at that value. This option works on the 16-bit depthmap and allows for 1000 steps to select the clip values.
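For intuition only, here is a minimal NumPy sketch of the clip-and-renormalize idea on a 16-bit depthmap. It is not the extension's implementation, and the assumption that `near`/`far` are expressed on the same 0–65535 scale as the depthmap is mine:

```python
import numpy as np

def clip_and_renormalize(depth_u16: np.ndarray, near: int, far: int) -> np.ndarray:
    """Clip a 16-bit depthmap to [near, far] and stretch the kept range back to 0..65535."""
    if far == near:
        # Degenerate case: approximate the "single depth plane" b&w mask described above.
        return np.where(depth_u16 >= near, 65535, 0).astype(np.uint16)
    d = np.clip(depth_u16.astype(np.float64), near, far)
    d = (d - near) / (far - near)          # renormalize the remaining values to 0..1
    return np.round(d * 65535).astype(np.uint16)
```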
55
+
56
+ When enabled, `Invert DepthMap` will result in a depthmap with black near and white far.
57
+
58
+ Regardless of global settings, `Save DepthMap` will always save the depthmap in the default txt2img or img2img directory with the filename suffix '_depth'. Generation parameters are saved with the image if enabled in settings. Files generated from the Depth tab are saved in the default extras-images directory.
59
+
60
+ To see the generated output in the webui, `Show DepthMap` should be enabled. When using Batch img2img, this option should also be enabled.
61
+
62
+ When `Combine into one image` is enabled, the depthmap will be combined with the original image, and the orientation can be selected with `Combine axis`. When disabled, the depthmap will be saved as a 16-bit single-channel PNG, as opposed to the three-channel (RGB), 8-bit-per-channel image produced when the option is enabled.
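As a rough illustration of the two saving modes, here is a sketch using Pillow and NumPy rather than the extension's own saving code; `depth_u16` and `source` are placeholder inputs:

```python
import numpy as np
from PIL import Image

depth_u16 = np.zeros((512, 512), dtype=np.uint16)   # placeholder 16-bit depthmap
source = Image.new("RGB", (512, 512))                # placeholder generated image

# Combine disabled: keep full precision as a single-channel 16-bit PNG.
Image.fromarray(depth_u16).save("depthmap-16bit.png")

# Combine enabled (horizontal axis): stitch an 8-bit RGB depthmap next to the source image.
depth_u8 = (depth_u16 // 257).astype(np.uint8)       # scale 0..65535 down to 0..255
depth_rgb = Image.fromarray(depth_u8).convert("RGB")
combined = Image.new("RGB", (source.width * 2, source.height))
combined.paste(source, (0, 0))
combined.paste(depth_rgb, (source.width, 0))
combined.save("depthmap-combined.png")
```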
63
+
64
+ When either `Generate Stereo` or `Generate anaglyph` is enabled, a stereo image pair will be generated. `Divergence` sets the desired amount of 3D effect. `Balance between eyes` determines where the (inevitable) distortion from filling up gaps will end up: -1 left, +1 right, 0 balanced.
65
+ The different `Gap fill technique` options are: none (no gaps are filled),
66
+ naive (the original method), naive_interpolating (the original method with interpolation), and polylines_soft / polylines_sharp (the newest techniques; the latter gives the best quality but is the slowest). Note: all stereo image generation is done on the CPU.
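To make `Divergence` concrete, below is a deliberately naive pixel-shift sketch of how a left/right pair can be faked from a depthmap. It bears no resemblance to the extension's polylines algorithms and does no gap filling; `divergence_px`, the maximum shift in pixels, is a made-up stand-in for the `Divergence` setting, and the "larger depth value = nearer" convention is assumed.

```python
import numpy as np

def naive_sbs(image: np.ndarray, depth: np.ndarray, divergence_px: int = 12) -> np.ndarray:
    """Naive side-by-side stereo: shift pixels horizontally in proportion to depth."""
    h, w = depth.shape
    d = depth.astype(np.float64)
    d = (d - d.min()) / max(float(np.ptp(d)), 1e-6)   # assume 1.0 = near, 0.0 = far
    left = np.zeros_like(image)
    right = np.zeros_like(image)
    cols = np.arange(w)
    for y in range(h):
        shift = (d[y] * divergence_px).astype(int)
        left[y, np.clip(cols + shift, 0, w - 1)] = image[y]
        right[y, np.clip(cols - shift, 0, w - 1)] = image[y]
    return np.concatenate([left, right], axis=1)        # unfilled gaps stay black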
67
+
68
+ To generate the mesh required for video generation, enable `Generate 3D inpainted mesh`. This can be a lengthy process, from a few minutes for small images to an hour for very large images. This option is only available on the Depth tab. When enabled, the mesh in ply format and four demo videos are generated. All files are saved to the extras directory.
69
+
70
+ Videos can be generated from the PLY mesh on the Depth Tab.
71
+ It requires the mesh created by this extension; files created elsewhere might not work correctly, as some extra info is stored in the file (the required value for dolly). Most options are self-explanatory, like `Number of frames` and `Framerate`. Two output `formats` are supported: mp4 and webm. Supersampling Anti-Aliasing (SSAA) can be used to get rid of jagged edges and flickering. The render size is scaled by this factor and then downsampled.
72
+ There are three `trajectories` to choose from (circle, straight-line, double-straight-line), with the option to `translate` in three dimensions. The border can be `cropped` on all four sides, and the `Dolly` option adjusts the FOV so the center subject stays approximately the same size, like a dolly zoom.
73
+
74
+ Settings on the WebUI Settings tab:
75
+ `Maximum wholesize for boost` sets the r_max value from the BoostingMonocularDepth paper. It relates to the maximum size chosen to render at internally and directly influences the maximum amount of VRAM that could be used. The default value from the paper is 3000; I have lowered it to 1600 so it will work more often on GPUs with 8GB VRAM.
76
+ If you often get out of memory errors when computing a depthmap on GPU while using Boost, you can try lowering this value. Note the 'wholeImage being processed in : xxxx' output when using boost; this number will never be greater than r_max, but it can be larger with a larger r_max. See the paper for more details.
77
+
78
+ > 💡 Saving as any format other than PNG always produces an 8 bit, 3 channel RGB image. A single channel 16 bit image is only supported when saving as PNG.
79
+
80
+ ## FAQ
81
+
82
+ * `Can I use this on existing images ?`
83
+ - Yes, you can use the Depth tab to easily process existing images.
84
+ - Another way of doing this would be to use img2img with denoising strength set to 0. This will effectively skip stable diffusion and use the input image. You will still have to set the correct size, and need to select `Crop and resize` instead of `Just resize` when the input image resolution does not match the set size perfectly.
85
+ * `Can I run this on Google Colab?`
86
+ - You can run the MiDaS network on their colab linked here https://pytorch.org/hub/intelisl_midas_v2/
87
+ - You can run BoostingMonocularDepth on their colab linked here : https://colab.research.google.com/github/compphoto/BoostingMonocularDepth/blob/main/Boostmonoculardepth.ipynb
88
+ - Running this program on Colab is not officially supported, but it may work. Please look for more suitable ways of running this. If you still decide to try, standalone installation may be easier to manage.
89
+ * `What other depth-related projects could I check out?`
90
+ - [DepthFlow](https://github.com/BrokenSource/DepthFlow) by [@Tremeschin](https://github.com/Tremeschin) for a very fast generation of 2.5D videos from images (no need to create mesh beforehand!)
91
+ - Several [scripts](https://github.com/Extraltodeus?tab=repositories) by [@Extraltodeus](https://github.com/Extraltodeus) using depth maps.
92
+ - geo-11, [Depth3D](https://github.com/BlueSkyDefender/Depth3D) and [Geo3D](https://github.com/Flugan/Geo3D-Installer) for playing existing games in 3D.
93
+ - (Feel free to suggest more projects in the discussions!)
94
+ * `How can I know what changed in the new version of the script?`
95
+ - You can see the git history log or refer to the `CHANGELOG.md` file.
96
+
97
+ ## Help wanted!
98
+ Developers wanted! Please help us fix the bugs and add new features by creating MRs.
99
+ All help is greatly appreciated.
100
+ Feel free to comment and share in the discussions and submit issues.
101
+
102
+ ## Acknowledgements
103
+
104
+ This project relies on code and information from the following papers :
105
+
106
+ MiDaS :
107
+
108
+ ```
109
+ @article {Ranftl2022,
110
+ author = "Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun",
111
+ title = "Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer",
112
+ journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence",
113
+ year = "2022",
114
+ volume = "44",
115
+ number = "3"
116
+ }
117
+ ```
118
+
119
+ Dense Prediction Transformers, DPT-based model :
120
+
121
+ ```
122
+ @article{Ranftl2021,
123
+ author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun},
124
+ title = {Vision Transformers for Dense Prediction},
125
+ journal = {ICCV},
126
+ year = {2021},
127
+ }
128
+ ```
129
+
130
+ AdelaiDepth/LeReS :
131
+
132
+ ```
133
+ @article{yin2022towards,
134
+ title={Towards Accurate Reconstruction of 3D Scene Shape from A Single Monocular Image},
135
+ author={Yin, Wei and Zhang, Jianming and Wang, Oliver and Niklaus, Simon and Chen, Simon and Liu, Yifan and Shen, Chunhua},
136
+ journal={TPAMI},
137
+ year={2022}
138
+ }
139
+ @inproceedings{Wei2021CVPR,
140
+ title = {Learning to Recover 3D Scene Shape from a Single Image},
141
+ author = {Wei Yin and Jianming Zhang and Oliver Wang and Simon Niklaus and Long Mai and Simon Chen and Chunhua Shen},
142
+ booktitle = {Proc. IEEE Conf. Comp. Vis. Patt. Recogn. (CVPR)},
143
+ year = {2021}
144
+ }
145
+ ```
146
+
147
+ Boosting Monocular Depth Estimation Models to High-Resolution via Content-Adaptive Multi-Resolution Merging :
148
+
149
+ ```
150
+ @inproceedings{Miangoleh2021Boosting,
151
+ title={Boosting Monocular Depth Estimation Models to High-Resolution via Content-Adaptive Multi-Resolution Merging},
152
+ author={S. Mahdi H. Miangoleh and Sebastian Dille and Long Mai and Sylvain Paris and Ya\u{g}{\i}z Aksoy},
153
+ journal={Proc. CVPR},
154
+ year={2021},
155
+ }
156
+ ```
157
+
158
+ 3D Photography using Context-aware Layered Depth Inpainting :
159
+
160
+ ```
161
+ @inproceedings{Shih3DP20,
162
+ author = {Shih, Meng-Li and Su, Shih-Yang and Kopf, Johannes and Huang, Jia-Bin},
163
+ title = {3D Photography using Context-aware Layered Depth Inpainting},
164
+ booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
165
+ year = {2020}
166
+ }
167
+ ```
168
+
169
+ U2-Net:
170
+
171
+ ```
172
+ @InProceedings{Qin_2020_PR,
173
+ title = {U2-Net: Going Deeper with Nested U-Structure for Salient Object Detection},
174
+ author = {Qin, Xuebin and Zhang, Zichen and Huang, Chenyang and Dehghan, Masood and Zaiane, Osmar and Jagersand, Martin},
175
+ journal = {Pattern Recognition},
176
+ volume = {106},
177
+ pages = {107404},
178
+ year = {2020}
179
+ }
180
+ ```
181
+
182
+ IS-Net:
183
+
184
+ ```
185
+ @InProceedings{qin2022,
186
+ author={Xuebin Qin and Hang Dai and Xiaobin Hu and Deng-Ping Fan and Ling Shao and Luc Van Gool},
187
+ title={Highly Accurate Dichotomous Image Segmentation},
188
+ booktitle={ECCV},
189
+ year={2022}
190
+ }
191
+ ```
192
+
193
+
194
+ ZoeDepth :
195
+
196
+ ```
197
+ @misc{https://doi.org/10.48550/arxiv.2302.12288,
198
+ doi = {10.48550/ARXIV.2302.12288},
199
+ url = {https://arxiv.org/abs/2302.12288},
200
+ author = {Bhat, Shariq Farooq and Birkl, Reiner and Wofk, Diana and Wonka, Peter and Müller, Matthias},
201
+ keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
202
+ title = {ZoeDepth: Zero-shot Transfer by Combining Relative and Metric Depth},
203
+ publisher = {arXiv},
204
+ year = {2023},
205
+ copyright = {arXiv.org perpetual, non-exclusive license}
206
+ }
207
+ ```
208
+
209
+ Marigold - Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation:
210
+
211
+ ```
212
+ @misc{ke2023repurposing,
213
+ title={Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation},
214
+ author={Bingxin Ke and Anton Obukhov and Shengyu Huang and Nando Metzger and Rodrigo Caye Daudt and Konrad Schindler},
215
+ year={2023},
216
+ eprint={2312.02145},
217
+ archivePrefix={arXiv},
218
+ primaryClass={cs.CV}
219
+ }
220
+ ```
221
+
222
+ Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data
223
+
224
+ ```
225
+ @misc{yang2024depth,
226
+ title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
227
+ author={Lihe Yang and Bingyi Kang and Zilong Huang and Xiaogang Xu and Jiashi Feng and Hengshuang Zhao},
228
+ year={2024},
229
+ eprint={2401.10891},
230
+ archivePrefix={arXiv},
231
+ primaryClass={cs.CV}
232
+ }
233
+ ```
234
+
235
+ Depth Anything V2
236
+
237
+ ```bibtex
238
+ @article{depth_anything_v2,
239
+ title={Depth Anything V2},
240
+ author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
241
+ journal={arXiv:2406.09414},
242
+ year={2024}
243
+ }
244
+ ```
__init__.py ADDED
File without changes
bundled_sources.txt ADDED
@@ -0,0 +1,25 @@
1
+ Since commit 110549b2 this extension bundles some code from other repositories.
2
+ This was done to prevent possible upstream breakage and allow fixing breakage quicker.
3
+ This file provides information about the original location of the code.
4
+ *** Some of the bundled code was already modified. ***
5
+
6
+ dmidas
7
+ https://github.com/isl-org/MiDaS/tree/master/midas/
8
+
9
+ dzoedepth
10
+ https://github.com/isl-org/ZoeDepth/tree/main/zoedepth/
11
+
12
+ inpaint
13
+ https://github.com/vt-vl-lab/3d-photo-inpainting/
14
+
15
+ lib
16
+ https://github.com/aim-uofa/AdelaiDepth/tree/main/LeReS/Minist_Test/lib/
17
+
18
+ pix2pix
19
+ https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/
20
+
21
+ Marigold
22
+ https://github.com/prs-eth/Marigold/tree/22437a
23
+
24
+ depth_anything_v2
25
+ https://github.com/DepthAnything/Depth-Anything-V2/tree/bc0283
ddepth_anything_v2/DA-2K.md ADDED
@@ -0,0 +1,51 @@
1
+ # DA-2K Evaluation Benchmark
2
+
3
+ ## Introduction
4
+
5
+ ![DA-2K](assets/DA-2K.png)
6
+
7
+ DA-2K is proposed in [Depth Anything V2](https://depth-anything-v2.github.io) to evaluate the relative depth estimation capability. It encompasses eight representative scenarios of `indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`. It consists of 1K diverse high-quality images and 2K precise pair-wise relative depth annotations.
8
+
9
+ Please refer to our [paper](https://arxiv.org/abs/2406.09414) for details on constructing this benchmark.
10
+
11
+
12
+ ## Usage
13
+
14
+ Please first [download the benchmark](https://huggingface.co/datasets/depth-anything/DA-2K/tree/main).
15
+
16
+ All annotations are stored in `annotations.json`. The annotation file is a JSON object where each key is the path to an image file, and the value is a list of annotations associated with that image. Each annotation describes two points and identifies which point is closer to the camera. The structure is detailed below:
17
+
18
+ ```
19
+ {
20
+ "image_path": [
21
+ {
22
+ "point1": [h1, w1], # (vertical position, horizontal position)
23
+ "point2": [h2, w2], # (vertical position, horizontal position)
24
+ "closer_point": "point1" # we always set "point1" as the closer one
25
+ },
26
+ ...
27
+ ],
28
+ ...
29
+ }
30
+ ```
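A small sketch of how these annotations might be consumed for evaluation; `predict_depth` is a hypothetical stand-in for whatever model is being scored, and the assumption that larger predicted values mean closer is mine:

```python
import json

def pairwise_accuracy(predict_depth, ann_path="annotations.json") -> float:
    """Fraction of DA-2K point pairs whose depth ordering the predictor gets right."""
    with open(ann_path) as f:
        annotations = json.load(f)
    correct = total = 0
    for image_path, pairs in annotations.items():
        depth = predict_depth(image_path)        # HxW array; larger value = closer (assumed)
        for pair in pairs:
            (h1, w1), (h2, w2) = pair["point1"], pair["point2"]
            guess = "point1" if depth[h1, w1] > depth[h2, w2] else "point2"
            correct += guess == pair["closer_point"]
            total += 1
    return correct / total
```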
31
+
32
+ To visualize the annotations:
33
+ ```bash
34
+ python visualize.py [--scene-type <type>]
35
+ ```
36
+
37
+ **Options**
38
+ - `--scene-type <type>` (optional): Specify the scene type (`indoor`, `outdoor`, `non_real`, `transparent_reflective`, `adverse_style`, `aerial`, `underwater`, and `object`). Skip this argument or set <type> as `""` to include all scene types.
39
+
40
+ ## Citation
41
+
42
+ If you find this benchmark useful, please consider citing:
43
+
44
+ ```bibtex
45
+ @article{depth_anything_v2,
46
+ title={Depth Anything V2},
47
+ author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
48
+ journal={arXiv:2406.09414},
49
+ year={2024}
50
+ }
51
+ ```
ddepth_anything_v2/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
ddepth_anything_v2/README.md ADDED
@@ -0,0 +1,201 @@
1
+ <div align="center">
2
+ <h1>Depth Anything V2</h1>
3
+
4
+ [**Lihe Yang**](https://liheyoung.github.io/)<sup>1</sup> · [**Bingyi Kang**](https://bingykang.github.io/)<sup>2&dagger;</sup> · [**Zilong Huang**](http://speedinghzl.github.io/)<sup>2</sup>
5
+ <br>
6
+ [**Zhen Zhao**](http://zhaozhen.me/) · [**Xiaogang Xu**](https://xiaogang00.github.io/) · [**Jiashi Feng**](https://sites.google.com/site/jshfeng/)<sup>2</sup> · [**Hengshuang Zhao**](https://hszhao.github.io/)<sup>1*</sup>
7
+
8
+ <sup>1</sup>HKU&emsp;&emsp;&emsp;<sup>2</sup>TikTok
9
+ <br>
10
+ &dagger;project lead&emsp;*corresponding author
11
+ &dagger;[Bingyi Kang](https://bingykang.github.io/) proposed this project and advised on every aspect of it.
12
+
13
+ <a href="https://arxiv.org/abs/2406.09414"><img src='https://img.shields.io/badge/arXiv-Depth Anything V2-red' alt='Paper PDF'></a>
14
+ <a href='https://depth-anything-v2.github.io'><img src='https://img.shields.io/badge/Project_Page-Depth Anything V2-green' alt='Project Page'></a>
15
+ <a href='https://huggingface.co/spaces/depth-anything/Depth-Anything-V2'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-blue'></a>
16
+ <a href='https://huggingface.co/datasets/depth-anything/DA-2K'><img src='https://img.shields.io/badge/Benchmark-DA--2K-yellow' alt='Benchmark'></a>
17
+ </div>
18
+
19
+ This work presents Depth Anything V2. It significantly outperforms [V1](https://github.com/LiheYoung/Depth-Anything) in fine-grained details and robustness. Compared with SD-based models, it offers faster inference, fewer parameters, and higher depth accuracy.
20
+
21
+ ![teaser](assets/teaser.png)
22
+
23
+
24
+ ## News
25
+
26
+ - **2024-07-06:** Depth Anything V2 is supported in [Transformers](https://github.com/huggingface/transformers/). See the [instructions](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2) for convenient usage.
27
+ - **2024-06-25:** Depth Anything is integrated into [Apple Core ML Models](https://developer.apple.com/machine-learning/models/). See the instructions ([V1](https://huggingface.co/apple/coreml-depth-anything-small), [V2](https://huggingface.co/apple/coreml-depth-anything-v2-small)) for usage.
28
+ - **2024-06-22:** We release [smaller metric depth models](https://github.com/DepthAnything/Depth-Anything-V2/tree/main/metric_depth#pre-trained-models) based on Depth-Anything-V2-Small and Base.
29
+ - **2024-06-20:** Our repository and project page were flagged by GitHub and removed from public view for 6 days. Sorry for the inconvenience.
30
+ - **2024-06-14:** Paper, project page, code, models, demo, and benchmark are all released.
31
+
32
+
33
+ ## Pre-trained Models
34
+
35
+ We provide **four models** of varying scales for robust relative depth estimation:
36
+
37
+ | Model | Params | Checkpoint |
38
+ |:-|-:|:-:|
39
+ | Depth-Anything-V2-Small | 24.8M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Small/resolve/main/depth_anything_v2_vits.pth?download=true) |
40
+ | Depth-Anything-V2-Base | 97.5M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Base/resolve/main/depth_anything_v2_vitb.pth?download=true) |
41
+ | Depth-Anything-V2-Large | 335.3M | [Download](https://huggingface.co/depth-anything/Depth-Anything-V2-Large/resolve/main/depth_anything_v2_vitl.pth?download=true) |
42
+ | Depth-Anything-V2-Giant | 1.3B | Coming soon |
43
+
44
+
45
+ ## Usage
46
+
47
+ ### Preparation
48
+
49
+ ```bash
50
+ git clone https://github.com/DepthAnything/Depth-Anything-V2
51
+ cd Depth-Anything-V2
52
+ pip install -r requirements.txt
53
+ ```
54
+
55
+ Download the checkpoints listed [here](#pre-trained-models) and put them under the `checkpoints` directory.
56
+
57
+ ### Use our models
58
+ ```python
59
+ import cv2
60
+ import torch
61
+
62
+ from depth_anything_v2.dpt import DepthAnythingV2
63
+
64
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
65
+
66
+ model_configs = {
67
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
68
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
69
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
70
+ 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
71
+ }
72
+
73
+ encoder = 'vitl' # or 'vits', 'vitb', 'vitg'
74
+
75
+ model = DepthAnythingV2(**model_configs[encoder])
76
+ model.load_state_dict(torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location='cpu'))
77
+ model = model.to(DEVICE).eval()
78
+
79
+ raw_img = cv2.imread('your/image/path')
80
+ depth = model.infer_image(raw_img) # HxW raw depth map in numpy
81
+ ```
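The returned `depth` is an HxW float32 array of relative depth. If you just want to inspect it, a minimal follow-up (continuing the snippet above, with illustrative file names) is to normalize it to 8 bit and save it:

```python
import cv2
import numpy as np

# Continuing from the snippet above: normalize the relative depth to 0-255 for visualization.
depth_vis = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
depth_vis = depth_vis.astype(np.uint8)

cv2.imwrite('depth_gray.png', depth_vis)  # grayscale visualization
cv2.imwrite('depth_color.png', cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO))  # pseudo-colored version
```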
82
+
83
+ If you do not want to clone this repository, you can also load our models through [Transformers](https://github.com/huggingface/transformers/). Below is a simple code snippet. Please refer to the [official page](https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2) for more details.
84
+
85
+ - Note 1: Make sure you can connect to Hugging Face and have the latest Transformers installed.
86
+ - Note 2: Due to the [upsampling difference](https://github.com/huggingface/transformers/pull/31522#issuecomment-2184123463) between OpenCV (used by us) and Pillow (used by HF), predictions may differ slightly, so we recommend loading our models in the way introduced above.
87
+ ```python
88
+ from transformers import pipeline
89
+ from PIL import Image
90
+
91
+ pipe = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf")
92
+ image = Image.open('your/image/path')
93
+ depth = pipe(image)["depth"]
94
+ ```
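The pipeline returns a dictionary whose `depth` entry is a PIL image (recent Transformers versions also expose the raw `predicted_depth` tensor). A small, illustrative follow-up for further processing:

```python
import numpy as np

depth_np = np.array(depth)  # `depth` from the snippet above is a PIL image; convert for post-processing
depth.save("depth_transformers.png")
```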
95
+
96
+ ### Running script on *images*
97
+
98
+ ```bash
99
+ python run.py \
100
+ --encoder <vits | vitb | vitl | vitg> \
101
+ --img-path <path> --outdir <outdir> \
102
+ [--input-size <size>] [--pred-only] [--grayscale]
103
+ ```
104
+ Options:
105
+ - `--img-path`: You can point it to 1) a directory containing all images of interest, 2) a single image, or 3) a text file listing the image paths.
106
+ - `--input-size` (optional): By default, we use input size `518` for model inference. ***You can increase the size for even more fine-grained results.***
107
+ - `--pred-only` (optional): Only save the predicted depth map, without raw image.
108
+ - `--grayscale` (optional): Save the grayscale depth map, without applying color palette.
109
+
110
+ For example:
111
+ ```bash
112
+ python run.py --encoder vitl --img-path assets/examples --outdir depth_vis
113
+ ```
114
+
115
+ ### Running script on *videos*
116
+
117
+ ```bash
118
+ python run_video.py \
119
+ --encoder <vits | vitb | vitl | vitg> \
120
+ --video-path assets/examples_video --outdir video_depth_vis \
121
+ [--input-size <size>] [--pred-only] [--grayscale]
122
+ ```
123
+
124
+ ***Our larger model has better temporal consistency on videos.***
125
+
126
+ ### Gradio demo
127
+
128
+ To use our gradio demo locally:
129
+
130
+ ```bash
131
+ python app.py
132
+ ```
133
+
134
+ You can also try our [online demo](https://huggingface.co/spaces/Depth-Anything/Depth-Anything-V2).
135
+
136
+ ***Note: Compared to V1, we have made a minor modification to the DINOv2-DPT architecture (originating from this [issue](https://github.com/LiheYoung/Depth-Anything/issues/81)).*** In V1, we *unintentionally* used features from the last four layers of DINOv2 for decoding. In V2, we use [intermediate features](https://github.com/DepthAnything/Depth-Anything-V2/blob/2cbc36a8ce2cec41d38ee51153f112e87c8e42d8/depth_anything_v2/dpt.py#L164-L169) instead. Although this modification did not improve details or accuracy, we decided to follow this common practice.
137
+
138
+
139
+ ## Fine-tuned to Metric Depth Estimation
140
+
141
+ Please refer to [metric depth estimation](./metric_depth).
142
+
143
+
144
+ ## DA-2K Evaluation Benchmark
145
+
146
+ Please refer to [DA-2K benchmark](./DA-2K.md).
147
+
148
+
149
+ ## Community Support
150
+
151
+ **We sincerely appreciate all the community support for our Depth Anything series. Thank you very much!**
152
+
153
+ - Apple Core ML:
154
+ - https://developer.apple.com/machine-learning/models
155
+ - https://huggingface.co/apple/coreml-depth-anything-v2-small
156
+ - https://huggingface.co/apple/coreml-depth-anything-small
157
+ - Transformers:
158
+ - https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything_v2
159
+ - https://huggingface.co/docs/transformers/main/en/model_doc/depth_anything
160
+ - TensorRT:
161
+ - https://github.com/spacewalk01/depth-anything-tensorrt
162
+ - https://github.com/zhujiajian98/Depth-Anythingv2-TensorRT-python
163
+ - ONNX: https://github.com/fabio-sim/Depth-Anything-ONNX
164
+ - ComfyUI: https://github.com/kijai/ComfyUI-DepthAnythingV2
165
+ - Transformers.js (real-time depth estimation in the browser): https://huggingface.co/spaces/Xenova/webgpu-realtime-depth-estimation
166
+ - Android:
167
+ - https://github.com/shubham0204/Depth-Anything-Android
168
+ - https://github.com/FeiGeChuanShu/ncnn-android-depth_anything
169
+
170
+
171
+ ## Acknowledgement
172
+
173
+ We are sincerely grateful to the awesome Hugging Face team ([@Pedro Cuenca](https://huggingface.co/pcuenq), [@Niels Rogge](https://huggingface.co/nielsr), [@Merve Noyan](https://huggingface.co/merve), [@Amy Roberts](https://huggingface.co/amyeroberts), et al.) for their huge efforts in supporting our models in Transformers and Apple Core ML.
174
+
175
+ We also thank the [DINOv2](https://github.com/facebookresearch/dinov2) team for contributing such impressive models to our community.
176
+
177
+
178
+ ## LICENSE
179
+
180
+ The Depth-Anything-V2-Small model is released under the Apache-2.0 license. The Depth-Anything-V2-Base/Large/Giant models are released under the CC-BY-NC-4.0 license.
181
+
182
+
183
+ ## Citation
184
+
185
+ If you find this project useful, please consider citing:
186
+
187
+ ```bibtex
188
+ @article{depth_anything_v2,
189
+ title={Depth Anything V2},
190
+ author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Zhao, Zhen and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
191
+ journal={arXiv:2406.09414},
192
+ year={2024}
193
+ }
194
+
195
+ @inproceedings{depth_anything_v1,
196
+ title={Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data},
197
+ author={Yang, Lihe and Kang, Bingyi and Huang, Zilong and Xu, Xiaogang and Feng, Jiashi and Zhao, Hengshuang},
198
+ booktitle={CVPR},
199
+ year={2024}
200
+ }
201
+ ```
ddepth_anything_v2/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .depth_anything_v2.dpt import DepthAnythingV2
ddepth_anything_v2/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (237 Bytes). View file
 
ddepth_anything_v2/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (267 Bytes). View file
 
ddepth_anything_v2/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (244 Bytes). View file
 
ddepth_anything_v2/app.py ADDED
@@ -0,0 +1,88 @@
1
+ import glob
2
+ import gradio as gr
3
+ import matplotlib
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torch
7
+ import tempfile
8
+ from gradio_imageslider import ImageSlider
9
+
10
+ from depth_anything_v2.dpt import DepthAnythingV2
11
+
12
+ css = """
13
+ #img-display-container {
14
+ max-height: 100vh;
15
+ }
16
+ #img-display-input {
17
+ max-height: 80vh;
18
+ }
19
+ #img-display-output {
20
+ max-height: 80vh;
21
+ }
22
+ #download {
23
+ height: 62px;
24
+ }
25
+ """
26
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
27
+ model_configs = {
28
+ 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
29
+ 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
30
+ 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
31
+ 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
32
+ }
33
+ encoder = 'vitl'
34
+ model = DepthAnythingV2(**model_configs[encoder])
35
+ state_dict = torch.load(f'checkpoints/depth_anything_v2_{encoder}.pth', map_location="cpu")
36
+ model.load_state_dict(state_dict)
37
+ model = model.to(DEVICE).eval()
38
+
39
+ title = "# Depth Anything V2"
40
+ description = """Official demo for **Depth Anything V2**.
41
+ Please refer to our [paper](https://arxiv.org/abs/2406.09414), [project page](https://depth-anything-v2.github.io), or [github](https://github.com/DepthAnything/Depth-Anything-V2) for more details."""
42
+
43
+ def predict_depth(image):
44
+ return model.infer_image(image)
45
+
46
+ with gr.Blocks(css=css) as demo:
47
+ gr.Markdown(title)
48
+ gr.Markdown(description)
49
+ gr.Markdown("### Depth Prediction demo")
50
+
51
+ with gr.Row():
52
+ input_image = gr.Image(label="Input Image", type='numpy', elem_id='img-display-input')
53
+ depth_image_slider = ImageSlider(label="Depth Map with Slider View", elem_id='img-display-output', position=0.5)
54
+ submit = gr.Button(value="Compute Depth")
55
+ gray_depth_file = gr.File(label="Grayscale depth map", elem_id="download",)
56
+ raw_file = gr.File(label="16-bit raw output (can be considered as disparity)", elem_id="download",)
57
+
58
+ cmap = matplotlib.colormaps.get_cmap('Spectral_r')
59
+
60
+ def on_submit(image):
61
+ original_image = image.copy()
62
+
63
+ h, w = image.shape[:2]
64
+
65
+ depth = predict_depth(image[:, :, ::-1])  # Gradio provides RGB; flip to BGR, which infer_image (OpenCV convention) expects
66
+
67
+ raw_depth = Image.fromarray(depth.astype('uint16'))
68
+ tmp_raw_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
69
+ raw_depth.save(tmp_raw_depth.name)
70
+
71
+ depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
72
+ depth = depth.astype(np.uint8)
73
+ colored_depth = (cmap(depth)[:, :, :3] * 255).astype(np.uint8)
74
+
75
+ gray_depth = Image.fromarray(depth)
76
+ tmp_gray_depth = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
77
+ gray_depth.save(tmp_gray_depth.name)
78
+
79
+ return [(original_image, colored_depth), tmp_gray_depth.name, tmp_raw_depth.name]
80
+
81
+ submit.click(on_submit, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file])
82
+
83
+ example_files = glob.glob('assets/examples/*')
84
+ examples = gr.Examples(examples=example_files, inputs=[input_image], outputs=[depth_image_slider, gray_depth_file, raw_file], fn=on_submit)
85
+
86
+
87
+ if __name__ == '__main__':
88
+ demo.queue().launch()
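Since the demo above hardcodes `encoder = 'vitl'`, launching it locally assumes the Large checkpoint from the Pre-trained Models table has already been placed at `checkpoints/depth_anything_v2_vitl.pth` (as described in the README's Preparation section); otherwise the `torch.load` call at startup will fail. Once the checkpoint is in place, `python app.py` starts the Gradio interface on a local URL.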
ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-310.pyc ADDED
Binary file (12.2 kB). View file
 
ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-311.pyc ADDED
Binary file (21.8 kB). View file
 
ddepth_anything_v2/depth_anything_v2/__pycache__/dinov2.cpython-312.pyc ADDED
Binary file (18.7 kB). View file
 
ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-310.pyc ADDED
Binary file (5.99 kB). View file
 
ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-311.pyc ADDED
Binary file (11.8 kB). View file
 
ddepth_anything_v2/depth_anything_v2/__pycache__/dpt.cpython-312.pyc ADDED
Binary file (10.7 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
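For orientation, here is a minimal, illustrative sketch of how the encoder built by this factory can be queried for intermediate features (the DPT decoder in `dpt.py` consumes such features); the shapes assume a 518x518 input, the ViT-L configuration, and patch size 14, and the snippet is not part of the original file:

```python
import torch

# Build the ViT-L encoder as configured by the factory above (patch size 14, 518x518 pretraining size).
encoder = DINOv2("vitl")
x = torch.randn(1, 3, 518, 518)  # height/width must be multiples of the patch size (14)

with torch.no_grad():
    # Take the last four blocks; with return_class_token=True each entry is (patch_tokens, cls_token).
    features = encoder.get_intermediate_layers(x, n=4, return_class_token=True)

patch_tokens, cls_token = features[-1]
print(patch_tokens.shape)  # torch.Size([1, 1369, 1024]) -> 37x37 patches of dimension 1024
print(cls_token.shape)     # torch.Size([1, 1024])
```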
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (450 Bytes). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (552 Bytes). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (463 Bytes). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-310.pyc ADDED
Binary file (2.42 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-311.pyc ADDED
Binary file (4.51 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/attention.cpython-312.pyc ADDED
Binary file (3.97 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-310.pyc ADDED
Binary file (8.02 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-311.pyc ADDED
Binary file (15.5 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/block.cpython-312.pyc ADDED
Binary file (13.1 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-310.pyc ADDED
Binary file (1.25 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-311.pyc ADDED
Binary file (1.9 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/drop_path.cpython-312.pyc ADDED
Binary file (1.68 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-310.pyc ADDED
Binary file (1.05 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-311.pyc ADDED
Binary file (1.66 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/layer_scale.cpython-312.pyc ADDED
Binary file (1.44 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-310.pyc ADDED
Binary file (1.24 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-311.pyc ADDED
Binary file (2.12 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/mlp.cpython-312.pyc ADDED
Binary file (1.87 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-310.pyc ADDED
Binary file (2.69 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-311.pyc ADDED
Binary file (4.49 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/patch_embed.cpython-312.pyc ADDED
Binary file (4.08 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-310.pyc ADDED
Binary file (2.04 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-311.pyc ADDED
Binary file (3.33 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/__pycache__/swiglu_ffn.cpython-312.pyc ADDED
Binary file (2.86 kB). View file
 
ddepth_anything_v2/depth_anything_v2/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
ddepth_anything_v2/depth_anything_v2/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
ddepth_anything_v2/depth_anything_v2/dinov2_layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super(DropPath, self).__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)