from transformers.configuration_utils import PretrainedConfig

# Maps every supported backbone name to the value exposed as
# ``SOLIDERConfig.vision_width``.
BACKBONE_NAME2WIDTH = {
    "swin_tiny_patch4_window7_224": 768,
    "swin_small_patch4_window7_224": 768,
    "swin_base_patch4_window7_224": 1024,
    "solider_tiny": 768,
    "solider_small": 768,
    "solider_base": 1024,
}


class SOLIDERConfig(PretrainedConfig):
    """Configuration holder for a SOLIDER (Swin Transformer) backbone.

    Every constructor argument is stored unchanged as an attribute of the
    same name. A few extra, purely informational attributes are derived
    from them (``img_size``, ``vision_width``, ``hidden_size``).
    """

    model_type = "swin_transformer"

    def __init__(
        self,
        pretrain_img_size=224,
        in_channels=3,
        embed_dims=96,
        patch_size=4,
        window_size=7,
        mlp_ratio=4,
        depths=(2, 2, 6, 2),
        num_heads=(3, 6, 12, 24),
        strides=(4, 2, 2, 2),
        out_indices=(0, 1, 2, 3),
        qkv_bias=True,
        qk_scale=None,
        patch_norm=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,  # NOTE: modified from the SOLIDER implementation
        use_abs_pos_embed=False,
        act_cfg=None,
        norm_cfg=None,
        with_cp=False,
        pretrained=None,
        convert_weights=False,
        frozen_stages=-1,
        init_cfg=None,
        semantic_weight=0.5,  # NOTE: modified from the SOLIDER implementation
        name="solider_small",
        **kwargs,
    ):
        """Store the backbone hyper-parameters on the config.

        Args:
            act_cfg: Activation config dict. ``None`` (the default) is
                replaced by a fresh ``{"type": "GELU"}``.
            norm_cfg: Normalization config dict. ``None`` (the default) is
                replaced by a fresh ``{"type": "LN"}``.
            name: Backbone identifier; must be a key of
                ``BACKBONE_NAME2WIDTH``.
            **kwargs: Forwarded to ``PretrainedConfig.__init__``.

            All remaining arguments are stored verbatim under the same
            attribute name.

        Raises:
            ValueError: If ``name`` is not a key of ``BACKBONE_NAME2WIDTH``.
        """
        # Validate explicitly: an ``assert`` would be stripped under
        # ``python -O`` and surface later as an unhelpful KeyError.
        if name not in BACKBONE_NAME2WIDTH:
            raise ValueError(
                f"Unknown backbone name {name!r}; expected one of "
                f"{sorted(BACKBONE_NAME2WIDTH)}."
            )

        self.pretrain_img_size = pretrain_img_size
        self.in_channels = in_channels
        self.embed_dims = embed_dims
        self.patch_size = patch_size
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio
        self.depths = depths
        self.num_heads = num_heads
        self.strides = strides
        self.out_indices = out_indices
        self.qkv_bias = qkv_bias
        self.qk_scale = qk_scale
        self.patch_norm = patch_norm
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.drop_path_rate = drop_path_rate
        self.use_abs_pos_embed = use_abs_pos_embed
        # Build the default dicts per instance so that mutating one config's
        # ``act_cfg`` / ``norm_cfg`` can never leak into another config.
        self.act_cfg = {"type": "GELU"} if act_cfg is None else act_cfg
        self.norm_cfg = {"type": "LN"} if norm_cfg is None else norm_cfg
        self.with_cp = with_cp
        self.pretrained = pretrained
        self.convert_weights = convert_weights
        self.frozen_stages = frozen_stages
        self.init_cfg = init_cfg
        self.semantic_weight = semantic_weight

        # NOTE: The attributes below are informational only; they have no
        # effect on how the model is built.
        self.img_size = pretrain_img_size
        self.name = name
        self.vision_width = BACKBONE_NAME2WIDTH[self.name]
        self.hidden_size = self.embed_dims

        super().__init__(**kwargs)