Upload folder using huggingface_hub
- configuration_intern_vit.py +117 -0
- configuration_internvl_chat.py +2 -107
- modeling_intern_vit.py +72 -1
- modeling_internvl_chat.py +1 -1
- preprocessor_config.json +1 -1
- special_tokens_map.json +20 -53
- tokenizer_config.json +32 -33
configuration_intern_vit.py
ADDED
@@ -0,0 +1,117 @@
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import os
+from typing import Union
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class InternVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+    instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of color channels in the input images (e.g., 3 for RGB).
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        qkv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries and values in the self-attention layers.
+        hidden_size (`int`, *optional*, defaults to 3200):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_attention_heads (`int`, *optional*, defaults to 25):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 12800):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        qk_normalization (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the queries and keys in the self-attention layers.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        use_flash_attn (`bool`, *optional*, defaults to `True`):
+            Whether to use the flash attention mechanism.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Dropout rate for stochastic depth.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 0.1):
+            A factor for layer scale.
+    """
+
+    model_type = 'intern_vit_6b'
+
+    def __init__(
+            self,
+            num_channels=3,
+            patch_size=14,
+            image_size=224,
+            qkv_bias=False,
+            hidden_size=3200,
+            num_attention_heads=25,
+            intermediate_size=12800,
+            qk_normalization=True,
+            num_hidden_layers=48,
+            use_flash_attn=True,
+            hidden_act='gelu',
+            layer_norm_eps=1e-6,
+            dropout=0.0,
+            drop_path_rate=0.0,
+            attention_dropout=0.0,
+            initializer_range=0.02,
+            initializer_factor=0.1,
+            **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.drop_path_rate = drop_path_rate
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.use_flash_attn = use_flash_attn
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        if 'vision_config' in config_dict:
+            config_dict = config_dict['vision_config']
+
+        if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
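Since the point of the new file is that the vision configuration can now be imported on its own, a minimal usage sketch may help; the checkpoint path in the commented line is a placeholder and not part of this commit:

    from configuration_intern_vit import InternVisionConfig

    # Defaults mirror the __init__ signature above (an InternViT-6B-sized encoder).
    config = InternVisionConfig()
    print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)  # 3200 48 25

    # from_pretrained can also point at a full InternVL-Chat checkpoint directory:
    # per the classmethod above, it reads config.json and keeps only the
    # 'vision_config' sub-dict before building the InternVisionConfig.
    # vision_config = InternVisionConfig.from_pretrained('/path/to/InternVL-Chat-checkpoint')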
configuration_internvl_chat.py
CHANGED
@@ -4,121 +4,16 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 
-import os
 import copy
-from typing import Union
 
 from transformers import LlamaConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
+from .configuration_intern_vit import InternVisionConfig
 
 
-class InternVisionConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
-    instantiate a vision encoder according to the specified arguments, defining the model architecture.
[... the remainder of the removed InternVisionConfig definition is line for line the class now added in configuration_intern_vit.py above, down to `return cls.from_dict(config_dict, **kwargs)` ...]
+logger = logging.get_logger(__name__)
 
 
 class InternVLChatConfig(PretrainedConfig):
modeling_intern_vit.py
CHANGED
@@ -20,7 +20,13 @@ from transformers.utils import logging
 from .configuration_intern_vit import InternVisionConfig
 
 try:
-    [one line removed here; its content was not captured in this diff view]
+    try:  # v1
+        from flash_attn.flash_attn_interface import \
+            flash_attn_unpadded_qkvpacked_func
+    except:  # v2
+        from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
+
+    from flash_attn.bert_padding import pad_input, unpad_input
     has_flash_attn = True
 except:
     print('FlashAttention is not installed.')

@@ -30,6 +36,70 @@ except:

 logger = logging.get_logger(__name__)
 
 
+class FlashAttention(nn.Module):
+    """Implement the scaled dot product attention with softmax.
+    Arguments
+    ---------
+        softmax_scale: The temperature to use for the softmax attention.
+                       (default: 1/sqrt(d_keys) where d_keys is computed at
+                       runtime)
+        attention_dropout: The dropout rate to apply to the attention
+                           (default: 0.0)
+    """
+
+    def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
+        super().__init__()
+        self.softmax_scale = softmax_scale
+        self.dropout_p = attention_dropout
+
+    def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
+                max_s=None, need_weights=False):
+        """Implements the multihead softmax attention.
+        Arguments
+        ---------
+            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
+                if unpadded: (nnz, 3, h, d)
+            key_padding_mask: a bool tensor of shape (B, S)
+        """
+        assert not need_weights
+        assert qkv.dtype in [torch.float16, torch.bfloat16]
+        assert qkv.is_cuda
+
+        if cu_seqlens is None:
+            batch_size = qkv.shape[0]
+            seqlen = qkv.shape[1]
+            if key_padding_mask is None:
+                qkv = rearrange(qkv, 'b s ... -> (b s) ...')
+                max_s = seqlen
+                cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
+                                          device=qkv.device)
+                output = flash_attn_unpadded_qkvpacked_func(
+                    qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                    softmax_scale=self.softmax_scale, causal=causal
+                )
+                output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
+            else:
+                nheads = qkv.shape[-2]
+                x = rearrange(qkv, 'b s three h d -> b s (three h d)')
+                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
+                x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
+                output_unpad = flash_attn_unpadded_qkvpacked_func(
+                    x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                    softmax_scale=self.softmax_scale, causal=causal
+                )
+                output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
+                                             indices, batch_size, seqlen),
+                                   'b s (h d) -> b s h d', h=nheads)
+        else:
+            assert max_s is not None
+            output = flash_attn_unpadded_qkvpacked_func(
+                qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                softmax_scale=self.softmax_scale, causal=causal
+            )
+
+        return output, None
+
+
 class InternRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         super().__init__()

@@ -279,6 +349,7 @@ class InternVisionEncoder(nn.Module):

 class InternVisionModel(PreTrainedModel):
     main_input_name = 'pixel_values'
     config_class = InternVisionConfig
+    _no_split_modules = ['InternAttention']
 
     def __init__(self, config: InternVisionConfig):
         super().__init__(config)
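The nested try/except added above exists because the packed-QKV kernel was renamed between flash-attn major versions: v1's flash_attn_unpadded_qkvpacked_func became v2's flash_attn_varlen_qkvpacked_func, which the code re-imports under the old name. A small illustrative probe, not part of the commit, shows the same fallback order:

    def probe_flash_attn():
        # Mirrors the import cascade in modeling_intern_vit.py: try the v1 name,
        # fall back to the v2 name, otherwise report that flash-attn is absent.
        try:
            from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func  # noqa: F401
            return 'flash-attn v1 API available'
        except ImportError:
            try:
                from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func  # noqa: F401
                return 'flash-attn v2 API available'
            except ImportError:
                return 'flash-attn not installed; has_flash_attn will be False'

    print(probe_flash_attn())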
modeling_internvl_chat.py
CHANGED
@@ -23,6 +23,7 @@ logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
+    _no_split_modules = ['InternAttention', 'LlamaDecoderLayer', 'LlamaForCausalLM']
 
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)

@@ -193,7 +194,6 @@ class InternVLChatModel(PreTrainedModel):

 
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
-
         from .conversation import get_conv_template
 
         template = get_conv_template(self.template)
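`_no_split_modules` is the hint that accelerate's big-model dispatch consults when a checkpoint is loaded with a device_map: the listed module classes (here InternAttention, LlamaDecoderLayer and LlamaForCausalLM) are kept whole on a single device rather than split across devices. A hypothetical multi-GPU load, with a placeholder repo id and assuming the repo wires up AutoModel via auto_map, might look like:

    import torch
    from transformers import AutoModel, AutoTokenizer

    path = 'OpenGVLab/InternVL-Chat-<placeholder>'  # hypothetical repo id, not from this commit
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModel.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        device_map='auto',        # sharding respects the _no_split_modules lists above
        trust_remote_code=True,   # the modeling code ships with the repo
    ).eval()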
preprocessor_config.json
CHANGED
@@ -16,4 +16,4 @@
     ],
     "resample": 3,
     "size": 448
-}
+}
(No visible content change in the closing brace; this is most likely just a newline-at-end-of-file fix.)
special_tokens_map.json
CHANGED
@@ -1,70 +1,31 @@
 {
     "additional_special_tokens": [
+        "<human>",
+        "<bot>",
+        "<img>",
+        "</img>",
+        "<vid>",
+        "</vid>",
+        "<box>",
+        "</box>",
+        "<ref>",
+        "</ref>",
         {
-            "content": "<human>",
+            "content": "<IMG_CONTEXT>",
             "lstrip": false,
             "normalized": false,
             "rstrip": false,
             "single_word": false
         },
         {
-            "content": "<bot>",
+            "content": "<quad>",
             "lstrip": false,
             "normalized": false,
             "rstrip": false,
             "single_word": false
         },
         {
-            "content": "<img>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</img>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "<vid>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</vid>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "<box>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</box>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "<ref>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</ref>",
+            "content": "</quad>",
             "lstrip": false,
             "normalized": false,
             "rstrip": false,

@@ -85,7 +46,13 @@

             "rstrip": false,
             "single_word": false
         },
-    "pad_token": [previous value not captured in this diff view],
+    "pad_token": {
+        "content": "<unk>",
+        "lstrip": false,
+        "normalized": false,
+        "rstrip": false,
+        "single_word": false
+    },
     "unk_token": {
         "content": "<unk>",
         "lstrip": false,
tokenizer_config.json
CHANGED
@@ -105,6 +105,30 @@
             "rstrip": false,
             "single_word": false,
             "special": true
+        },
+        "41916": {
+            "content": "<IMG_CONTEXT>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "41917": {
+            "content": "<quad>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "41918": {
+            "content": "</quad>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
         }
     },
     "additional_special_tokens": [

@@ -117,47 +141,22 @@

         "<box>",
         "</box>",
         "<ref>",
-        "</ref>"
+        "</ref>",
+        "<IMG_CONTEXT>",
+        "<quad>",
+        "</quad>"
     ],
-    "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "bos_token": "<s>",
     "clean_up_tokenization_spaces": false,
-    "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "eos_token": "</s>",
     "legacy": true,
     "model_max_length": 768,
-    "pad_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "pad_token": "<unk>",
     "padding_side": "right",
     "sp_model_kwargs": {},
     "spaces_between_special_tokens": false,
     "tokenizer_class": "LlamaTokenizer",
-    "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "unk_token": "<unk>",
     "use_default_system_prompt": true,
     "use_fast": true
 }
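The three new added_tokens_decoder entries pin <IMG_CONTEXT>, <quad> and </quad> to ids 41916-41918, and bos/eos/pad/unk are now stored as plain strings. A quick sanity check is sketched below; the path is a placeholder for a local checkout of this repository:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('/path/to/this/repo')

    # The expected ids come straight from the added_tokens_decoder entries above.
    for token, expected_id in [('<IMG_CONTEXT>', 41916), ('<quad>', 41917), ('</quad>', 41918)]:
        assert tokenizer.convert_tokens_to_ids(token) == expected_id

    # pad/unk are now plain strings in tokenizer_config.json.
    print(tokenizer.pad_token, tokenizer.unk_token)  # <unk> <unk>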