# --------------------------------------------------------
# InternImage
# Copyright (c) 2025 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from transformers import PretrainedConfig


class InternImageConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`~InternImageModel`]. It is used to
    instantiate an InternImage model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the InternImage
    [OpenGVLab/internimage](https://huggingface.co/OpenGVLab/internimage) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        core_op (`str`, *optional*, defaults to `"DCNv3"`):
            Core operator used in the InternImage blocks.
        depths (`tuple`, *optional*, defaults to `(4, 4, 18, 4)`):
            Number of blocks in each stage of the model.
        groups (`tuple`, *optional*, defaults to `(4, 8, 16, 32)`):
            Number of groups used by the core operator in each stage of the model.
        channels (`int`, *optional*, defaults to `64`):
            Number of channels in the first stage of the model.
        dw_kernel_size (`int`, *optional*, defaults to `None`):
            Kernel size of the depthwise convolutions.
        layer_scale (`float`, *optional*, defaults to `None`):
            Initial value of layer scale. If `None`, layer scale is disabled.
        offset_scale (`float`, *optional*, defaults to `1.0`):
            Offset scale used by the core operator.
        mlp_ratio (`float`, *optional*, defaults to `4.0`):
            Ratio of the MLP hidden dimension to the embedding dimension.
        post_norm (`bool`, *optional*, defaults to `False`):
            Whether to use post normalization in the blocks.
        res_post_norm (`bool`, *optional*, defaults to `False`):
            Whether to apply post normalization on the residual branches of the blocks.
        level2_post_norm (`bool`, *optional*, defaults to `False`):
            Whether to use post normalization in the level-2 stage.
        level2_post_norm_block_ids (`list`, *optional*, defaults to `None`):
            Block IDs of the level-2 stage to which post normalization is applied.
        center_feature_scale (`bool`, *optional*, defaults to `False`):
            Whether to apply center feature scaling.
        use_clip_projector (`bool`, *optional*, defaults to `False`):
            Whether to use a CLIP projector head.
        remove_center (`bool`, *optional*, defaults to `False`):
            Whether to remove the center sampling point in the core operator.
        num_classes (`int`, *optional*, defaults to `1000`):
            Number of classes for the classification head.
        drop_rate (`float`, *optional*, defaults to `0.0`):
            Dropout rate in the model.
        drop_path_rate (`float`, *optional*, defaults to `0.0`):
            Drop path (stochastic depth) rate in the model.
        drop_path_type (`str`, *optional*, defaults to `"linear"`):
            Schedule used to distribute the drop path rate across blocks.
        act_layer (`str`, *optional*, defaults to `"GELU"`):
            Activation function used in the model.
        norm_layer (`str`, *optional*, defaults to `"LN"`):
            Normalization layer used in the model.
        cls_scale (`float`, *optional*, defaults to `1.5`):
            Width scale factor of the classification head.
        with_cp (`bool`, *optional*, defaults to `False`):
            Whether to use gradient checkpointing to save memory.
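
    Example (illustrative sketch; assumes this file is named `configuration_internimage.py` and is used together
    with a matching `InternImageModel` implementation loaded via `trust_remote_code`):

    ```python
    >>> from configuration_internimage import InternImageConfig

    >>> # Initializing a configuration with the default values
    >>> configuration = InternImageConfig()

    >>> # Overriding a few hyper-parameters (values chosen only for illustration)
    >>> configuration = InternImageConfig(drop_path_rate=0.1, num_classes=100)
    ```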
""" model_type = 'internimage' def __init__( self, core_op='DCNv3', depths=(4, 4, 18, 4), groups=(4, 8, 16, 32), channels=64, dw_kernel_size=None, layer_scale=None, offset_scale=1.0, mlp_ratio=4.0, post_norm=False, res_post_norm=False, level2_post_norm=False, level2_post_norm_block_ids=None, center_feature_scale=False, use_clip_projector=False, remove_center=False, num_classes=1000, drop_rate=0.0, drop_path_rate=0.0, drop_path_type='linear', act_layer='GELU', norm_layer='LN', cls_scale=1.5, with_cp=False, **kwargs, ): super().__init__(**kwargs) # Model configuration parameters self.core_op = core_op self.depths = depths self.groups = groups self.channels = channels self.dw_kernel_size = dw_kernel_size self.layer_scale = layer_scale self.offset_scale = offset_scale self.mlp_ratio = mlp_ratio self.post_norm = post_norm self.res_post_norm = res_post_norm self.level2_post_norm = level2_post_norm self.level2_post_norm_block_ids = level2_post_norm_block_ids self.center_feature_scale = center_feature_scale self.use_clip_projector = use_clip_projector self.remove_center = remove_center self.num_classes = num_classes self.drop_rate = drop_rate self.drop_path_rate = drop_path_rate self.drop_path_type = drop_path_type self.act_layer = act_layer self.norm_layer = norm_layer self.cls_scale = cls_scale self.with_cp = with_cp