Norod78 committed on
Commit
7cf7150
·
verified ·
1 Parent(s): a5733f0

DepthAnything_v2-Small-CoreML

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sample_images/IMG_4061.jpeg filter=lfs diff=lfs merge=lfs -text
37
+ sample_images/Xcode_Preview_DepthAnything_v2_Small_518x392_Landscape.jpg filter=lfs diff=lfs merge=lfs -text
38
+ sample_images/Xcode_Preview_DepthAnything_v2-Large.jpg filter=lfs diff=lfs merge=lfs -text
DepthAnything_v2-Small518x518_Box_iPhone16ProMax.mlperf/report.json ADDED
The diff for this file is too large to render. See raw diff
 
DepthAnything_v2_Small_518x392_Landscape-iPhone16ProMax.mlperf/report.json ADDED
The diff for this file is too large to render. See raw diff
 
DepthAnything_v2_Small_518x392_Landscape.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:458cf8d7e4452aa244bf38afc18a0022faf30dd702cf83bf10e3e29a9945cf58
3
+ size 193409
DepthAnything_v2_Small_518x392_Landscape.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14b9132bafdcf2326a59143679e99756f4e5968fa4d0cf56a70426bc9c3ef5eb
3
+ size 49435968
DepthAnything_v2_Small_518x392_Landscape.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "0A4C5ADD-47E5-4249-82B0-7F5FDA6E3A15": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "2F6A228C-105C-49CE-B330-6A07AA8C9427": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "2F6A228C-105C-49CE-B330-6A07AA8C9427"
18
+ }
DepthAnything_v2_Small_518x518_Box.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3b7a06a2f8001979451aa5aeed21051a86260d2c34eb35f328d210f121814d8
3
+ size 191982
DepthAnything_v2_Small_518x518_Box.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12b40d8ca366d162aaffa34a87d8b9c0b6794a915f90f34374d939dc8181bb24
3
+ size 49436736
DepthAnything_v2_Small_518x518_Box.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "A54D6005-5C99-4A1E-B946-1B66EB42E8FC": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "A87F1A2E-FD73-4076-B06B-8E1C87937CF6": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "A87F1A2E-FD73-4076-B06B-8E1C87937CF6"
18
+ }
PyTorch2CoreML-dpt.ipynb ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "1e99de7a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "#!git clone https://huggingface.co/spaces/depth-anything/Depth-Anything-V2\n",
11
+ "#!pip install -r Depth-Anything-V2/requirements.txt\n",
12
+ "#!pip install -q --upgrade coremltools\n",
13
+ "#!cp ./patch_dinov2.diff Depth-Anything-V2/\n",
14
+ "#!cd Depth-Anything-V2 && git apply patch_dinov2.diff\n",
15
+ "#!cd .."
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 2,
21
+ "id": "d6cb8a61",
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "import os\n",
26
+ "os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 3,
32
+ "id": "801db364",
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stderr",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "scikit-learn version 1.6.0 is not supported. Minimum required version: 0.17. Maximum required version: 1.5.1. Disabling scikit-learn conversion API.\n"
40
+ ]
41
+ }
42
+ ],
43
+ "source": [
44
+ "import torch\n",
45
+ "import coremltools as ct\n",
46
+ "import numpy as np\n",
47
+ "from PIL import Image\n",
48
+ "import tempfile\n",
49
+ "from huggingface_hub import hf_hub_download\n",
50
+ "import sys\n",
51
+ "sys.path.append('./Depth-Anything-V2')\n",
52
+ "\n"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 4,
58
+ "id": "73882c02",
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "name": "stderr",
63
+ "output_type": "stream",
64
+ "text": [
65
+ "xFormers not available\n",
66
+ "xFormers not available\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "from depth_anything_v2.dpt import DepthAnythingV2\n",
72
+ "from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet\n",
73
+ "\n",
74
+ "import torch.nn.functional as F"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "id": "26f7dcff",
80
+ "metadata": {},
81
+ "source": [
82
+ "# 1. Load Depth-Anything-V2's vits (Small) checkpoint"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 5,
88
+ "id": "e67aa722",
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'\n",
93
+ "model_configs = {\n",
94
+ " 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},\n",
95
+ " 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},\n",
96
+ " 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},\n",
97
+ " 'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}\n",
98
+ "}\n",
99
+ "encoder2name = {\n",
100
+ " 'vits': 'Small',\n",
101
+ " 'vitb': 'Base',\n",
102
+ " 'vitl': 'Large',\n",
103
+ " 'vitg': 'Giant', # we are undergoing company review procedures to release our giant model checkpoint\n",
104
+ "}\n",
105
+ "encoder = 'vits'\n",
106
+ "model_name = encoder2name[encoder]\n",
107
+ "model = DepthAnythingV2(**model_configs[encoder])\n",
108
+ "filepath = hf_hub_download(repo_id=f\"depth-anything/Depth-Anything-V2-{model_name}\", filename=f\"depth_anything_v2_{encoder}.pth\", repo_type=\"model\")\n",
109
+ "state_dict = torch.load(filepath, map_location=\"cpu\")\n",
110
+ "model.load_state_dict(state_dict)\n",
111
+ "model = model.to(DEVICE).eval()"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 6,
117
+ "id": "a632e6b4",
118
+ "metadata": {},
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "(3024, 4032, 3)\n"
125
+ ]
126
+ }
127
+ ],
128
+ "source": [
129
+ "image = Image.open(\"./sample_images/IMG_4061.jpeg\")\n",
130
+ "img = np.array(image)\n",
131
+ "print(img.shape)\n",
132
+ "h, w = img.shape[:2]\n",
133
+ "depth = model.infer_image(img)\n",
134
+ "depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0\n",
135
+ "depth = depth.astype(np.uint8)\n",
136
+ "depth_image = Image.fromarray(depth)\n",
137
+ "depth_image.save(f\"depth_image_{model_name}_1.jpg\")"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 7,
143
+ "id": "77477217",
144
+ "metadata": {},
145
+ "outputs": [
146
+ {
147
+ "name": "stdout",
148
+ "output_type": "stream",
149
+ "text": [
150
+ "(3024, 4032, 3)\n"
151
+ ]
152
+ },
153
+ {
154
+ "name": "stderr",
155
+ "output_type": "stream",
156
+ "text": [
157
+ "/Users/dadler/Projects/Glide/ai-bots/depth/./Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:73: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
158
+ " assert H % patch_H == 0, f\"Input image height {H} is not a multiple of patch height {patch_H}\"\n",
159
+ "/Users/dadler/Projects/Glide/ai-bots/depth/./Depth-Anything-V2/depth_anything_v2/dinov2_layers/patch_embed.py:74: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
160
+ " assert W % patch_W == 0, f\"Input image width {W} is not a multiple of patch width: {patch_W}\"\n",
161
+ "/Users/dadler/Projects/Glide/ai-bots/depth/./Depth-Anything-V2/depth_anything_v2/dinov2.py:183: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
162
+ " if npatch == N and w == h:\n",
163
+ "/Users/dadler/Projects/Glide/ai-bots/depth/./Depth-Anything-V2/depth_anything_v2/dpt.py:147: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
164
+ " out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode=\"bilinear\", align_corners=True)\n"
165
+ ]
166
+ }
167
+ ],
168
+ "source": [
169
+ "original_image = Image.open(\"./sample_images/IMG_4061.jpeg\")\n",
170
+ "origina_img = np.array(original_image)\n",
171
+ "print(origina_img.shape)\n",
172
+ "original_h, original_w = origina_img.shape[:2]\n",
173
+ "# Resize the image to the input size, width must be 518 and height must be divisible by 14\n",
174
+ "input_size_w = 518\n",
175
+ "#input_size_h = 392 #To have this work, you need to patch dinov2.py \n",
176
+ "input_size_h = 518\n",
177
+ "image = original_image.resize((input_size_w,input_size_h), Image.Resampling.BILINEAR)\n",
178
+ "img = np.array(image)\n",
179
+ "input_image, (h, w) = model.image2tensor(img, input_size_h)\n",
180
+ "input_image = input_image.to(DEVICE)\n",
181
+ "with torch.no_grad():\n",
182
+ " depth = model(input_image)\n",
183
+ " depth = F.interpolate(depth[:, None], (h, w), mode=\"bilinear\", align_corners=True)[0, 0]\n",
184
+ " depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0\n",
185
+ " depth = depth.cpu().numpy().astype(np.uint8)\n",
186
+ "depth_image = Image.fromarray(depth).resize((original_w,original_h), Image.Resampling.BILINEAR)\n",
187
+ "depth_image.save(f\"depth_image_{model_name}_2.jpg\")\n",
188
+ "\n",
189
+ "traced_model = torch.jit.trace(model, input_image)\n"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 8,
195
+ "id": "42632870",
196
+ "metadata": {},
197
+ "outputs": [
198
+ {
199
+ "name": "stdout",
200
+ "output_type": "stream",
201
+ "text": [
202
+ "Traced PyTorch ImageEncoder ckpt out for jpg:\n",
203
+ ">>> tensor([[0.0157, 0.0149, 0.0080, ..., 0.0410, 0.0407, 0.0510],\n",
204
+ " [0.0043, 0.0084, 0.0000, ..., 0.0359, 0.0472, 0.0514],\n",
205
+ " [0.0027, 0.0058, 0.0000, ..., 0.0333, 0.0354, 0.0526],\n",
206
+ " ...,\n",
207
+ " [0.0135, 0.0170, 0.0090, ..., 0.0534, 0.0506, 0.0532],\n",
208
+ " [0.0157, 0.0203, 0.0122, ..., 0.0559, 0.0546, 0.0420],\n",
209
+ " [0.0191, 0.0238, 0.0168, ..., 0.0588, 0.0576, 0.0648]],\n",
210
+ " device='mps:0', grad_fn=<SliceBackward0>)\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "example_output = traced_model(input_image)\n",
216
+ "print(\"Traced PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", example_output[0, :10])"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "markdown",
221
+ "id": "3c0d9c70",
222
+ "metadata": {},
223
+ "source": [
224
+ "You can see that there is some loss in precision, but it is still acceptable."
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "markdown",
229
+ "id": "ca182b4a",
230
+ "metadata": {},
231
+ "source": [
232
+ "# 2. Export ImageEncoder"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 9,
238
+ "id": "ef7af5c5",
239
+ "metadata": {},
240
+ "outputs": [],
241
+ "source": [
242
+ "image_means = [0.485, 0.456, 0.406]\n",
243
+ "image_stds = [0.229, 0.224, 0.225]"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 10,
249
+ "id": "8f66a99c",
250
+ "metadata": {},
251
+ "outputs": [],
252
+ "source": [
253
+ "import torchvision.transforms as transforms\n",
254
+ "\n",
255
+ "class Wrapper(torch.nn.Module): \n",
256
+ " def __init__(self, model):\n",
257
+ " super().__init__()\n",
258
+ " _means = image_means\n",
259
+ " _stds = image_stds\n",
260
+ " self.model = model \n",
261
+ " self.stds = torch.tensor(_stds).half()[:,None,None]\n",
262
+ " self.means = torch.tensor(_means).half()[:,None,None]\n",
263
+ "\n",
264
+ " transform_model = torch.nn.Sequential(\n",
265
+ " transforms.Normalize(mean=image_means, std=image_stds)\n",
266
+ " )\n",
267
+ "\n",
268
+ " def forward(self, input): \n",
269
+ " input = input/255.0\n",
270
+ " input = self.transform_model(input)\n",
271
+ " output = self.model(input)\n",
272
+ " output = (output - output.min()) / (output.max() - output.min()) \n",
273
+ " # Fix \"Image output, 'depthOutput', must have rank 4. Instead it has rank 3\"\n",
274
+ " output = output.unsqueeze(0)\n",
275
+ " # Fix \"Shape of the RGB/BGR image output, 'depthOutput', must be of kind (1, 3, H, W), i.e., first two dimensions must be (1, 3), instead they are: (1, 1)\"\n",
276
+ " output = output.repeat(1, 3, 1, 1)\n",
277
+ " output = output * 255.0\n",
278
+ " return output\n",
279
+ "\n",
280
+ "# Instantiate the Wrapper model passing the traced Depth-Anything-V2 model\n",
281
+ "wrapped_model = Wrapper(traced_model)"
282
+ ]
283
+ },
284
+ {
285
+ "cell_type": "code",
286
+ "execution_count": 11,
287
+ "id": "b3da3350",
288
+ "metadata": {},
289
+ "outputs": [
290
+ {
291
+ "name": "stdout",
292
+ "output_type": "stream",
293
+ "text": [
294
+ "wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
295
+ ">>> tensor([[[ 1.0442, 1.0795, 1.0259, ..., 2.5866, 2.6540, 2.5864],\n",
296
+ " [ 0.9688, 1.2331, 1.0579, ..., 2.8632, 2.9795, 2.7485],\n",
297
+ " [ 0.9795, 1.2034, 0.9449, ..., 2.9342, 2.9196, 2.8207],\n",
298
+ " ...,\n",
299
+ " [100.1750, 100.6220, 100.7177, ..., 97.1819, 96.7440, 97.0862],\n",
300
+ " [100.6218, 100.7040, 100.8275, ..., 97.2966, 97.6106, 97.7243],\n",
301
+ " [ 99.4266, 100.6614, 100.1300, ..., 97.4383, 98.1441, 98.3714]],\n",
302
+ "\n",
303
+ " [[ 1.0442, 1.0795, 1.0259, ..., 2.5866, 2.6540, 2.5864],\n",
304
+ " [ 0.9688, 1.2331, 1.0579, ..., 2.8632, 2.9795, 2.7485],\n",
305
+ " [ 0.9795, 1.2034, 0.9449, ..., 2.9342, 2.9196, 2.8207],\n",
306
+ " ...,\n",
307
+ " [100.1750, 100.6220, 100.7177, ..., 97.1819, 96.7440, 97.0862],\n",
308
+ " [100.6218, 100.7040, 100.8275, ..., 97.2966, 97.6106, 97.7243],\n",
309
+ " [ 99.4266, 100.6614, 100.1300, ..., 97.4383, 98.1441, 98.3714]],\n",
310
+ "\n",
311
+ " [[ 1.0442, 1.0795, 1.0259, ..., 2.5866, 2.6540, 2.5864],\n",
312
+ " [ 0.9688, 1.2331, 1.0579, ..., 2.8632, 2.9795, 2.7485],\n",
313
+ " [ 0.9795, 1.2034, 0.9449, ..., 2.9342, 2.9196, 2.8207],\n",
314
+ " ...,\n",
315
+ " [100.1750, 100.6220, 100.7177, ..., 97.1819, 96.7440, 97.0862],\n",
316
+ " [100.6218, 100.7040, 100.8275, ..., 97.2966, 97.6106, 97.7243],\n",
317
+ " [ 99.4266, 100.6614, 100.1300, ..., 97.4383, 98.1441, 98.3714]]],\n",
318
+ " device='mps:0')\n",
319
+ "Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\n",
320
+ ">>> tensor([[[ 1.0442, 1.0795, 1.0259, ..., 2.5866, 2.6540, 2.5864],\n",
321
+ " [ 0.9688, 1.2331, 1.0579, ..., 2.8632, 2.9795, 2.7485],\n",
322
+ " [ 0.9795, 1.2034, 0.9449, ..., 2.9342, 2.9196, 2.8207],\n",
323
+ " ...,\n",
324
+ " [100.1750, 100.6220, 100.7177, ..., 97.1819, 96.7440, 97.0862],\n",
325
+ " [100.6218, 100.7040, 100.8275, ..., 97.2966, 97.6106, 97.7243],\n",
326
+ " [ 99.4266, 100.6614, 100.1300, ..., 97.4383, 98.1441, 98.3714]],\n",
327
+ "\n",
328
+ " [[ 1.0442, 1.0795, 1.0259, ..., 2.5866, 2.6540, 2.5864],\n",
329
+ " [ 0.9688, 1.2331, 1.0579, ..., 2.8632, 2.9795, 2.7485],\n",
330
+ " [ 0.9795, 1.2034, 0.9449, ..., 2.9342, 2.9196, 2.8207],\n",
331
+ " ...,\n",
332
+ " [100.1750, 100.6220, 100.7177, ..., 97.1819, 96.7440, 97.0862],\n",
333
+ " [100.6218, 100.7040, 100.8275, ..., 97.2966, 97.6106, 97.7243],\n",
334
+ " [ 99.4266, 100.6614, 100.1300, ..., 97.4383, 98.1441, 98.3714]],\n",
335
+ "\n",
336
+ " [[ 1.0442, 1.0795, 1.0259, ..., 2.5866, 2.6540, 2.5864],\n",
337
+ " [ 0.9688, 1.2331, 1.0579, ..., 2.8632, 2.9795, 2.7485],\n",
338
+ " [ 0.9795, 1.2034, 0.9449, ..., 2.9342, 2.9196, 2.8207],\n",
339
+ " ...,\n",
340
+ " [100.1750, 100.6220, 100.7177, ..., 97.1819, 96.7440, 97.0862],\n",
341
+ " [100.6218, 100.7040, 100.8275, ..., 97.2966, 97.6106, 97.7243],\n",
342
+ " [ 99.4266, 100.6614, 100.1300, ..., 97.4383, 98.1441, 98.3714]]],\n",
343
+ " device='mps:0')\n"
344
+ ]
345
+ }
346
+ ],
347
+ "source": [
348
+ "i = np.asarray(original_image.resize((input_size_w, input_size_h)))\n",
349
+ "i = i.astype(\"float32\")\n",
350
+ "i = np.transpose(i, (2, 0, 1))\n",
351
+ "i = np.expand_dims(i, 0)\n",
352
+ "i = torch.from_numpy(i).to(DEVICE)\n",
353
+ "\n",
354
+ "with torch.no_grad():\n",
355
+ " out = wrapped_model(i)\n",
356
+ "\n",
357
+ "print(\"wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])\n",
358
+ "\n",
359
+ "traced_model_w = torch.jit.trace(wrapped_model, i)\n",
360
+ "\n",
361
+ "with torch.no_grad():\n",
362
+ " out = traced_model_w(i)\n",
363
+ "\n",
364
+ "print(\"Traced wrapped PyTorch ImageEncoder ckpt out for jpg:\\n>>>\", out[0, :10])"
365
+ ]
366
+ },
367
+ {
368
+ "cell_type": "code",
369
+ "execution_count": 12,
370
+ "id": "db5cb9b9",
371
+ "metadata": {},
372
+ "outputs": [
373
+ {
374
+ "data": {
375
+ "text/plain": [
376
+ "(torch.Size([1, 3, 518, 518]), torch.Size([1, 3, 518, 518]))"
377
+ ]
378
+ },
379
+ "execution_count": 12,
380
+ "metadata": {},
381
+ "output_type": "execute_result"
382
+ }
383
+ ],
384
+ "source": [
385
+ "i.shape, out.shape"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 13,
391
+ "id": "681683aa",
392
+ "metadata": {},
393
+ "outputs": [
394
+ {
395
+ "name": "stdout",
396
+ "output_type": "stream",
397
+ "text": [
398
+ "(1, 3, 518, 518) 255.0 0.0 101.90155\n",
399
+ "(518, 518, 3) 255 0 101.40160403094767\n"
400
+ ]
401
+ }
402
+ ],
403
+ "source": [
404
+ "tmp = out.cpu().numpy()\n",
405
+ "\n",
406
+ "print(tmp.shape, tmp.max(), tmp.min(), tmp.mean())\n",
407
+ "# Convert from (1, 3, H, W) to (1, H, W, 3) so PIL can read it\n",
408
+ "tmp = np.transpose(tmp, (0, 2, 3, 1)).astype(np.uint8)\n",
409
+ "tmp = tmp.squeeze()\n",
410
+ "print(tmp.shape, tmp.max(), tmp.min(), tmp.mean())\n",
411
+ "Image.fromarray(tmp)\n",
412
+ "tmp_image = Image.fromarray(tmp).resize((original_w,original_h))\n",
413
+ "tmp_image.save(f\"depth_image_{model_name}_3.png\")"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": 14,
419
+ "id": "9e4f00bd",
420
+ "metadata": {},
421
+ "outputs": [
422
+ {
423
+ "data": {
424
+ "text/plain": [
425
+ "torch.Size([1, 3, 518, 518])"
426
+ ]
427
+ },
428
+ "execution_count": 14,
429
+ "metadata": {},
430
+ "output_type": "execute_result"
431
+ }
432
+ ],
433
+ "source": [
434
+ "i.shape"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 15,
440
+ "id": "304ae7b0",
441
+ "metadata": {},
442
+ "outputs": [
443
+ {
444
+ "name": "stderr",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 779/780 [00:00<00:00, 7178.40 ops/s]\n",
448
+ "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 150.72 passes/s]\n",
449
+ "Running MIL default pipeline: 100%|██████████| 89/89 [00:01<00:00, 64.35 passes/s] \n",
450
+ "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 165.76 passes/s]\n"
451
+ ]
452
+ }
453
+ ],
454
+ "source": [
455
+ "traced_model_w.eval()\n",
456
+ "image_input = ct.ImageType(name=\"colorImage\", shape=i.shape)\n",
457
+ "image_encoder_model = ct.converters.convert(\n",
458
+ " traced_model_w,\n",
459
+ " convert_to=\"mlprogram\",\n",
460
+ " inputs=[image_input],\n",
461
+ " outputs=[ct.ImageType(name=\"depthOutput\")],\n",
462
+ " minimum_deployment_target=ct.target.iOS16,\n",
463
+ ")\n",
464
+ "image_encoder_model.save(f\"DepthAnything_v2_{model_name}_{input_size_w}x{input_size_h}_Box.mlpackage\")"
465
+ ]
466
+ }
467
+ ],
468
+ "metadata": {
469
+ "kernelspec": {
470
+ "display_name": "pytorch2",
471
+ "language": "python",
472
+ "name": "python3"
473
+ },
474
+ "language_info": {
475
+ "codemirror_mode": {
476
+ "name": "ipython",
477
+ "version": 3
478
+ },
479
+ "file_extension": ".py",
480
+ "mimetype": "text/x-python",
481
+ "name": "python",
482
+ "nbconvert_exporter": "python",
483
+ "pygments_lexer": "ipython3",
484
+ "version": "3.10.14"
485
+ }
486
+ },
487
+ "nbformat": 4,
488
+ "nbformat_minor": 5
489
+ }
sample_images/IMG_4061.jpeg ADDED

Git LFS Details

  • SHA256: a30353ab460c8b9e8ba5540f0ee23177c27953a3d74057a0d1f4ee321241c943
  • Pointer size: 132 Bytes
  • Size of remote file: 3.6 MB
sample_images/Xcode_Preview_DepthAnything_v2-Large.jpg ADDED

Git LFS Details

  • SHA256: 99c31bb69beb8a55c46313c970955393a5d3357af3d6e05506451e002d8321dd
  • Pointer size: 131 Bytes
  • Size of remote file: 217 kB
sample_images/Xcode_Preview_DepthAnything_v2_Small_518x392_Landscape.jpg ADDED

Git LFS Details

  • SHA256: 853b51df66967560c6e45b411632265269a9e753f79f46981122644592db9dcd
  • Pointer size: 131 Bytes
  • Size of remote file: 271 kB