LPX55 committed on
Commit 4c6c42a · verified · 1 Parent(s): 73f5f27

Create scripts/convert_to_pytorch.py

Files changed (1)
  1. scripts/convert_to_pytorch.py +240 -0
scripts/convert_to_pytorch.py ADDED
@@ -0,0 +1,240 @@
+ """Convert ViT and non-distilled DeiT checkpoints from the timm library."""
+
+ import argparse
+ from pathlib import Path
+
+ import requests
+ import timm
+ import torch
+ from PIL import Image
+ from timm.data import ImageNetInfo, infer_imagenet_subset
+
+ from transformers import DeiTImageProcessor, ViTConfig, ViTForImageClassification, ViTImageProcessor, ViTModel
+ from transformers.utils import logging
+
+
+ logging.set_verbosity_info()
+ logger = logging.get_logger(__name__)
+
+
+ # here we list all keys to be renamed (original name on the left, our name on the right)
+ def create_rename_keys(config, base_model=False):
+     rename_keys = []
+     for i in range(config.num_hidden_layers):
+         # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
+         rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight"))
+         rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias"))
+         rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight"))
+         rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias"))
+         rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight"))
+         rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias"))
+         rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight"))
+         rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias"))
+         rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight"))
+         rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias"))
+
+     # projection layer + position embeddings
+     rename_keys.extend(
+         [
+             ("cls_token", "vit.embeddings.cls_token"),
+             ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"),
+             ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"),
+             ("pos_embed", "vit.embeddings.position_embeddings"),
+         ]
+     )
+
+     if base_model:
+         # layernorm
+         rename_keys.extend(
+             [
+                 ("norm.weight", "layernorm.weight"),
+                 ("norm.bias", "layernorm.bias"),
+             ]
+         )
+
+         # if just the base model, we should remove "vit" from all keys that start with "vit"
+         rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys]
+     else:
+         # layernorm + classification head
+         rename_keys.extend(
+             [
+                 ("norm.weight", "vit.layernorm.weight"),
+                 ("norm.bias", "vit.layernorm.bias"),
+                 ("head.weight", "classifier.weight"),
+                 ("head.bias", "classifier.bias"),
+             ]
+         )
+
+     return rename_keys
+
+
+ # we split up the matrix of each encoder layer into queries, keys and values
+ def read_in_q_k_v(state_dict, config, base_model=False):
+     for i in range(config.num_hidden_layers):
+         if base_model:
+             prefix = ""
+         else:
+             prefix = "vit."
+         # read in weights + bias of input projection layer (in timm, this is a single matrix + bias)
+         in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight")
+         in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias")
+         # next, add query, keys and values (in that order) to the state dict
+         state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
+             : config.hidden_size, :
+         ]
+         state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size]
+         state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
+             config.hidden_size : config.hidden_size * 2, :
+         ]
+         state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[
+             config.hidden_size : config.hidden_size * 2
+         ]
+         state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
+             -config.hidden_size :, :
+         ]
+         state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :]
+
+
+ def remove_classification_head_(state_dict):
+     ignore_keys = ["head.weight", "head.bias"]
+     for k in ignore_keys:
+         state_dict.pop(k, None)
+
+
+ def rename_key(dct, old, new):
+     val = dct.pop(old)
+     dct[new] = val
+
+
+ # We will verify our results on an image of cute cats
+ def prepare_img():
+     url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+     im = Image.open(requests.get(url, stream=True).raw)
+     return im
+
+
+ @torch.no_grad()
+ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
+     """
+     Copy/paste/tweak model's weights to our ViT structure.
+     """
+
+     # define default ViT configuration
+     config = ViTConfig()
+     base_model = False
+
+     # load original model from timm
+     timm_model = timm.create_model(vit_name, pretrained=True)
+     timm_model.eval()
+
+     # detect unsupported ViT models in transformers
+     # fc_norm is present
+     if not isinstance(getattr(timm_model, "fc_norm", None), torch.nn.Identity):
+         raise ValueError(f"{vit_name} is not supported in transformers because of the presence of fc_norm.")
+
+     # use of global average pooling in combination (or without) class token
+     if getattr(timm_model, "global_pool", None) == "avg":
+         raise ValueError(f"{vit_name} is not supported in transformers because of use of global average pooling.")
+
+     # CLIP style vit with norm_pre layer present
+     if "clip" in vit_name and not isinstance(getattr(timm_model, "norm_pre", None), torch.nn.Identity):
+         raise ValueError(
+             f"{vit_name} is not supported in transformers because it's a CLIP style ViT with norm_pre layer."
+         )
+
+     # SigLIP style vit with attn_pool layer present
+     if "siglip" in vit_name and getattr(timm_model, "global_pool", None) == "map":
+         raise ValueError(
+             f"{vit_name} is not supported in transformers because it's a SigLIP style ViT with attn_pool."
+         )
+
+     # use of layer scale in ViT model blocks
+     if not isinstance(getattr(timm_model.blocks[0], "ls1", None), torch.nn.Identity) or not isinstance(
+         getattr(timm_model.blocks[0], "ls2", None), torch.nn.Identity
+     ):
+         raise ValueError(f"{vit_name} is not supported in transformers because it uses a layer scale in its blocks.")
+
+     # Hybrid ResNet-ViTs
+     if not isinstance(timm_model.patch_embed, timm.layers.PatchEmbed):
+         raise ValueError(f"{vit_name} is not supported in transformers because it is a hybrid ResNet-ViT.")
+
+     # get patch size and image size from the patch embedding submodule
+     config.patch_size = timm_model.patch_embed.patch_size[0]
+     config.image_size = timm_model.patch_embed.img_size[0]
+
+     # retrieve architecture-specific parameters from the timm model
+     config.hidden_size = timm_model.embed_dim
+     config.intermediate_size = timm_model.blocks[0].mlp.fc1.out_features
+     config.num_hidden_layers = len(timm_model.blocks)
+     config.num_attention_heads = timm_model.blocks[0].attn.num_heads
+
+     # check whether the model has a classification head or not
+     if timm_model.num_classes != 0:
+         config.num_labels = timm_model.num_classes
+         # infer ImageNet subset from timm model
+         imagenet_subset = infer_imagenet_subset(timm_model)
+         dataset_info = ImageNetInfo(imagenet_subset)
+         config.id2label = {i: dataset_info.index_to_label_name(i) for i in range(dataset_info.num_classes())}
+         config.label2id = {v: k for k, v in config.id2label.items()}
+     else:
+         print(f"{vit_name} is going to be converted as a feature extractor only.")
+         base_model = True
+
+     # load state_dict of original model
+     state_dict = timm_model.state_dict()
+
+     # remove and rename some keys in the state dict
+     if base_model:
+         remove_classification_head_(state_dict)
+     rename_keys = create_rename_keys(config, base_model)
+     for src, dest in rename_keys:
+         rename_key(state_dict, src, dest)
+     read_in_q_k_v(state_dict, config, base_model)
+
+     # load HuggingFace model
+     if base_model:
+         model = ViTModel(config, add_pooling_layer=False).eval()
+     else:
+         model = ViTForImageClassification(config).eval()
+     model.load_state_dict(state_dict)
+
+     # Check outputs on an image, prepared by ViTImageProcessor/DeiTImageProcessor
+     if "deit" in vit_name:
+         image_processor = DeiTImageProcessor(size=config.image_size)
+     else:
+         image_processor = ViTImageProcessor(size=config.image_size)
+     encoding = image_processor(images=prepare_img(), return_tensors="pt")
+     pixel_values = encoding["pixel_values"]
+     outputs = model(pixel_values)
+
+     if base_model:
+         timm_pooled_output = timm_model.forward_features(pixel_values)
+         assert timm_pooled_output.shape == outputs.last_hidden_state.shape
+         assert torch.allclose(timm_pooled_output, outputs.last_hidden_state, atol=1e-1)
+     else:
+         timm_logits = timm_model(pixel_values)
+         assert timm_logits.shape == outputs.logits.shape
+         assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)
+
+     Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+     print(f"Saving model {vit_name} to {pytorch_dump_folder_path}")
+     model.save_pretrained(pytorch_dump_folder_path)
+     print(f"Saving image processor to {pytorch_dump_folder_path}")
+     image_processor.save_pretrained(pytorch_dump_folder_path)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     # Required parameters
+     parser.add_argument(
+         "--vit_name",
+         default="vit_base_patch16_224",
+         type=str,
+         help="Name of the ViT timm model you'd like to convert.",
+     )
+     parser.add_argument(
+         "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+     )
+
+     args = parser.parse_args()
+     convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path)
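Usage note (not part of the commit): assuming timm, torch, transformers, Pillow and requests are installed, the new script is meant to be run from the command line; the output directory below is only an illustrative placeholder, not a path referenced in the diff:

    # example invocation; "./vit-base-patch16-224" is a hypothetical output folder
    python scripts/convert_to_pytorch.py --vit_name vit_base_patch16_224 --pytorch_dump_folder_path ./vit-base-patch16-224

This calls convert_vit_checkpoint, which rebuilds the timm checkpoint as a ViTModel (feature extractor) or ViTForImageClassification, checks the converted model's outputs against the original timm model on a COCO sample image, and saves the weights together with the image processor to the given folder.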