Update config to reflect changes in the model architecture

Browse files

Files changed (3) hide show

README.md +4 -4
assets/tiger.jpg +0 -0
config.json +130 -19

README.md CHANGED Viewed

@@ -8,7 +8,7 @@ pipeline_tag: depth-estimation
 Install the required libraries:
 ```bash
 pip install -q numpy pillow torch torchvision
-pip install -q git+https://github.com/geetu040/transformers.git@depth-pro-projects#egg=transformers
 ```
 Import the required libraries:
@@ -22,14 +22,14 @@ from huggingface_hub import hf_hub_download
 import matplotlib.pyplot as plt
 # custom installation from this PR: https://github.com/huggingface/transformers/pull/34583
-# !pip install git+https://github.com/geetu040/transformers.git@depth-pro-projects#egg=transformers
 from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation
 ```
 Load the model and image processor:
 ```py
 checkpoint = "geetu040/DepthPro"
-revision = "project"
 image_processor = DepthProImageProcessorFast.from_pretrained(checkpoint, revision=revision)
 model = DepthProForDepthEstimation.from_pretrained(checkpoint, revision=revision)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -40,7 +40,7 @@ Inference:
 ```py
 # inference
-url = "https://huggingface.co/spaces/geetu040/DepthPro_Segmentation_Human/resolve/main/assets/examples/man_with_arms_open.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 image = image.convert("RGB")

 Install the required libraries:
 ```bash
 pip install -q numpy pillow torch torchvision
+pip install -q git+https://github.com/geetu040/transformers.git@depth-pro#egg=transformers
 ```
 Import the required libraries:
 import matplotlib.pyplot as plt
 # custom installation from this PR: https://github.com/huggingface/transformers/pull/34583
+# !pip install git+https://github.com/geetu040/transformers.git@depth-pro#egg=transformers
 from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation
 ```
 Load the model and image processor:
 ```py
 checkpoint = "geetu040/DepthPro"
+revision = "main"
 image_processor = DepthProImageProcessorFast.from_pretrained(checkpoint, revision=revision)
 model = DepthProForDepthEstimation.from_pretrained(checkpoint, revision=revision)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ```py
 # inference
+url = "https://huggingface.co/geetu040/DepthPro/resolve/main/assets/tiger.jpg"
 image = Image.open(requests.get(url, stream=True).raw)
 image = image.convert("RGB")

assets/tiger.jpg ADDED Viewed

config.json CHANGED Viewed

@@ -1,14 +1,92 @@
 {
-  "apply_layernorm": true,
   "architectures": [
     "DepthProForDepthEstimation"
   ],
-  "attention_probs_dropout_prob": 0.0,
-  "drop_path_rate": 0.0,
   "fusion_hidden_size": 256,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.0,
-  "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_feature_dims": [
     256,
@@ -18,18 +96,52 @@
     11,
     5
   ],
-  "layer_norm_eps": 1e-06,
-  "layerscale_value": 1.0,
-  "mlp_ratio": 4,
   "model_type": "depth_pro",
-  "num_attention_heads": 16,
-  "num_channels": 3,
   "num_fov_head_layers": 2,
-  "num_hidden_layers": 24,
-  "patch_embeddings_size": 16,
   "patch_size": 384,
-  "qkv_bias": true,
-  "reshape_hidden_states": true,
   "scaled_images_feature_dims": [
     1024,
     1024,
@@ -45,10 +157,9 @@
     0.5,
     1
   ],
-  "torch_dtype": "float32",
-  "transformers_version": "4.48.0.dev0",
   "use_batch_norm_in_fusion_residual": false,
   "use_bias_in_fusion_residual": true,
-  "use_fov_model": true,
-  "use_swiglu_ffn": false
 }

 {
   "architectures": [
     "DepthProForDepthEstimation"
   ],
+  "fov_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "fusion_hidden_size": 256,
+  "image_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "initializer_range": 0.02,
   "intermediate_feature_dims": [
     256,
     11,
     5
   ],
+  "merge_padding_value": 3,
   "model_type": "depth_pro",
   "num_fov_head_layers": 2,
+  "patch_model_config": {
+    "hidden_size": 1024,
+    "image_size": 384,
+    "model_type": "dinov2",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "out_features": [
+      "stage24"
+    ],
+    "out_indices": [
+      24
+    ],
+    "patch_size": 16,
+    "stage_names": [
+      "stem",
+      "stage1",
+      "stage2",
+      "stage3",
+      "stage4",
+      "stage5",
+      "stage6",
+      "stage7",
+      "stage8",
+      "stage9",
+      "stage10",
+      "stage11",
+      "stage12",
+      "stage13",
+      "stage14",
+      "stage15",
+      "stage16",
+      "stage17",
+      "stage18",
+      "stage19",
+      "stage20",
+      "stage21",
+      "stage22",
+      "stage23",
+      "stage24"
+    ],
+    "use_mask_token": false
+  },
   "patch_size": 384,
   "scaled_images_feature_dims": [
     1024,
     1024,
     0.5,
     1
   ],
+  "torch_dtype": "float16",
+  "transformers_version": "4.49.0.dev0",
   "use_batch_norm_in_fusion_residual": false,
   "use_bias_in_fusion_residual": true,
+  "use_fov_model": true
 }