Upload folder using huggingface_hub
- configuration_intern_vit.py +117 -0
- configuration_internvl_chat.py +2 -107
- modeling_intern_vit.py +72 -1
- modeling_internvl_chat.py +1 -1
- preprocessor_config.json +1 -1
- special_tokens_map.json +20 -53
- tokenizer_config.json +32 -33
configuration_intern_vit.py
ADDED
@@ -0,0 +1,117 @@
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import os
+from typing import Union
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class InternVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+    instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of color channels in the input images (e.g., 3 for RGB).
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        qkv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries and values in the self-attention layers.
+        hidden_size (`int`, *optional*, defaults to 3200):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_attention_heads (`int`, *optional*, defaults to 25):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 12800):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        qk_normalization (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the queries and keys in the self-attention layers.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        use_flash_attn (`bool`, *optional*, defaults to `True`):
+            Whether to use the flash attention mechanism.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Dropout rate for stochastic depth.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 0.1):
+            A factor for layer scale.
+    """
+
+    model_type = 'intern_vit_6b'
+
+    def __init__(
+            self,
+            num_channels=3,
+            patch_size=14,
+            image_size=224,
+            qkv_bias=False,
+            hidden_size=3200,
+            num_attention_heads=25,
+            intermediate_size=12800,
+            qk_normalization=True,
+            num_hidden_layers=48,
+            use_flash_attn=True,
+            hidden_act='gelu',
+            layer_norm_eps=1e-6,
+            dropout=0.0,
+            drop_path_rate=0.0,
+            attention_dropout=0.0,
+            initializer_range=0.02,
+            initializer_factor=0.1,
+            **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.drop_path_rate = drop_path_rate
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.use_flash_attn = use_flash_attn
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        if 'vision_config' in config_dict:
+            config_dict = config_dict['vision_config']
+
+        if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
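Since the point of the new file is that the vision configuration can now be imported on its own, a minimal usage sketch may help; the checkpoint path in the commented line is a placeholder and not part of this commit:

    from configuration_intern_vit import InternVisionConfig

    # Defaults mirror the __init__ signature above (an InternViT-6B-sized encoder).
    config = InternVisionConfig()
    print(config.hidden_size, config.num_hidden_layers, config.num_attention_heads)  # 3200 48 25

    # from_pretrained can also point at a full InternVL-Chat checkpoint directory:
    # per the classmethod above, it reads config.json and keeps only the
    # 'vision_config' sub-dict before building the InternVisionConfig.
    # vision_config = InternVisionConfig.from_pretrained('/path/to/InternVL-Chat-checkpoint')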
configuration_internvl_chat.py
CHANGED
@@ -4,121 +4,16 @@
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 
-import os
 import copy
-from typing import Union
 
 from transformers import LlamaConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 
+from .configuration_intern_vit import InternVisionConfig
 
 
-class InternVisionConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
-    instantiate a vision encoder according to the specified arguments, defining the model architecture.
[... the remainder of the removed InternVisionConfig definition is line for line the class now added in configuration_intern_vit.py above, down to `return cls.from_dict(config_dict, **kwargs)` ...]
+logger = logging.get_logger(__name__)
 
 
 class InternVLChatConfig(PretrainedConfig):
modeling_intern_vit.py
CHANGED
@@ -20,7 +20,13 @@ from transformers.utils import logging
 from .configuration_intern_vit import InternVisionConfig
 
 try:
-    [one line removed here; its content was not captured in this diff view]
+    try:  # v1
+        from flash_attn.flash_attn_interface import \
+            flash_attn_unpadded_qkvpacked_func
+    except:  # v2
+        from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
+
+    from flash_attn.bert_padding import pad_input, unpad_input
     has_flash_attn = True
 except:
     print('FlashAttention is not installed.')

@@ -30,6 +36,70 @@ except:

 logger = logging.get_logger(__name__)
 
 
+class FlashAttention(nn.Module):
+    """Implement the scaled dot product attention with softmax.
+    Arguments
+    ---------
+        softmax_scale: The temperature to use for the softmax attention.
+                       (default: 1/sqrt(d_keys) where d_keys is computed at
+                       runtime)
+        attention_dropout: The dropout rate to apply to the attention
+                           (default: 0.0)
+    """
+
+    def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
+        super().__init__()
+        self.softmax_scale = softmax_scale
+        self.dropout_p = attention_dropout
+
+    def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
+                max_s=None, need_weights=False):
+        """Implements the multihead softmax attention.
+        Arguments
+        ---------
+            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
+                if unpadded: (nnz, 3, h, d)
+            key_padding_mask: a bool tensor of shape (B, S)
+        """
+        assert not need_weights
+        assert qkv.dtype in [torch.float16, torch.bfloat16]
+        assert qkv.is_cuda
+
+        if cu_seqlens is None:
+            batch_size = qkv.shape[0]
+            seqlen = qkv.shape[1]
+            if key_padding_mask is None:
+                qkv = rearrange(qkv, 'b s ... -> (b s) ...')
+                max_s = seqlen
+                cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
+                                          device=qkv.device)
+                output = flash_attn_unpadded_qkvpacked_func(
+                    qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                    softmax_scale=self.softmax_scale, causal=causal
+                )
+                output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
+            else:
+                nheads = qkv.shape[-2]
+                x = rearrange(qkv, 'b s three h d -> b s (three h d)')
+                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
+                x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
+                output_unpad = flash_attn_unpadded_qkvpacked_func(
+                    x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                    softmax_scale=self.softmax_scale, causal=causal
+                )
+                output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
+                                             indices, batch_size, seqlen),
+                                   'b s (h d) -> b s h d', h=nheads)
+        else:
+            assert max_s is not None
+            output = flash_attn_unpadded_qkvpacked_func(
+                qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
+                softmax_scale=self.softmax_scale, causal=causal
+            )
+
+        return output, None
+
+
 class InternRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         super().__init__()

@@ -279,6 +349,7 @@ class InternVisionEncoder(nn.Module):

 class InternVisionModel(PreTrainedModel):
     main_input_name = 'pixel_values'
     config_class = InternVisionConfig
+    _no_split_modules = ['InternAttention']
 
     def __init__(self, config: InternVisionConfig):
         super().__init__(config)
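The nested try/except added above exists because the packed-QKV kernel was renamed between flash-attn major versions: v1's flash_attn_unpadded_qkvpacked_func became v2's flash_attn_varlen_qkvpacked_func, which the code re-imports under the old name. A small illustrative probe, not part of the commit, shows the same fallback order:

    def probe_flash_attn():
        # Mirrors the import cascade in modeling_intern_vit.py: try the v1 name,
        # fall back to the v2 name, otherwise report that flash-attn is absent.
        try:
            from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func  # noqa: F401
            return 'flash-attn v1 API available'
        except ImportError:
            try:
                from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func  # noqa: F401
                return 'flash-attn v2 API available'
            except ImportError:
                return 'flash-attn not installed; has_flash_attn will be False'

    print(probe_flash_attn())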
modeling_internvl_chat.py
CHANGED
@@ -23,6 +23,7 @@ logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
+    _no_split_modules = ['InternAttention', 'LlamaDecoderLayer', 'LlamaForCausalLM']
 
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)

@@ -193,7 +194,6 @@ class InternVLChatModel(PreTrainedModel):

 
         img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
         self.img_context_token_id = img_context_token_id
-
         from .conversation import get_conv_template
 
         template = get_conv_template(self.template)
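`_no_split_modules` is the hint that accelerate's big-model dispatch consults when a checkpoint is loaded with a device_map: the listed module classes (here InternAttention, LlamaDecoderLayer and LlamaForCausalLM) are kept whole on a single device rather than split across devices. A hypothetical multi-GPU load, with a placeholder repo id and assuming the repo wires up AutoModel via auto_map, might look like:

    import torch
    from transformers import AutoModel, AutoTokenizer

    path = 'OpenGVLab/InternVL-Chat-<placeholder>'  # hypothetical repo id, not from this commit
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModel.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        device_map='auto',        # sharding respects the _no_split_modules lists above
        trust_remote_code=True,   # the modeling code ships with the repo
    ).eval()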
preprocessor_config.json
CHANGED
@@ -16,4 +16,4 @@
     ],
     "resample": 3,
     "size": 448
-}
+}
(No visible content change in the closing brace; this is most likely just a newline-at-end-of-file fix.)
special_tokens_map.json
CHANGED
@@ -1,70 +1,31 @@
 {
     "additional_special_tokens": [
+        "<human>",
+        "<bot>",
+        "<img>",
+        "</img>",
+        "<vid>",
+        "</vid>",
+        "<box>",
+        "</box>",
+        "<ref>",
+        "</ref>",
         {
-            "content": "<human>",
+            "content": "<IMG_CONTEXT>",
             "lstrip": false,
             "normalized": false,
             "rstrip": false,
             "single_word": false
         },
         {
-            "content": "<bot>",
+            "content": "<quad>",
             "lstrip": false,
             "normalized": false,
             "rstrip": false,
             "single_word": false
         },
         {
-            "content": "<img>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</img>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "<vid>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</vid>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "<box>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</box>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "<ref>",
-            "lstrip": false,
-            "normalized": false,
-            "rstrip": false,
-            "single_word": false
-        },
-        {
-            "content": "</ref>",
+            "content": "</quad>",
             "lstrip": false,
             "normalized": false,
             "rstrip": false,

@@ -85,7 +46,13 @@

             "rstrip": false,
             "single_word": false
         },
-    "pad_token": [previous value not captured in this diff view],
+    "pad_token": {
+        "content": "<unk>",
+        "lstrip": false,
+        "normalized": false,
+        "rstrip": false,
+        "single_word": false
+    },
     "unk_token": {
         "content": "<unk>",
         "lstrip": false,
tokenizer_config.json
CHANGED
@@ -105,6 +105,30 @@
             "rstrip": false,
             "single_word": false,
             "special": true
+        },
+        "41916": {
+            "content": "<IMG_CONTEXT>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "41917": {
+            "content": "<quad>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
+        },
+        "41918": {
+            "content": "</quad>",
+            "lstrip": false,
+            "normalized": false,
+            "rstrip": false,
+            "single_word": false,
+            "special": true
         }
     },
     "additional_special_tokens": [

@@ -117,47 +141,22 @@

         "<box>",
         "</box>",
         "<ref>",
-        "</ref>"
+        "</ref>",
+        "<IMG_CONTEXT>",
+        "<quad>",
+        "</quad>"
     ],
-    "bos_token": {
-        "__type": "AddedToken",
-        "content": "<s>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "bos_token": "<s>",
     "clean_up_tokenization_spaces": false,
-    "eos_token": {
-        "__type": "AddedToken",
-        "content": "</s>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "eos_token": "</s>",
     "legacy": true,
     "model_max_length": 768,
-    "pad_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "pad_token": "<unk>",
     "padding_side": "right",
     "sp_model_kwargs": {},
     "spaces_between_special_tokens": false,
     "tokenizer_class": "LlamaTokenizer",
-    "unk_token": {
-        "__type": "AddedToken",
-        "content": "<unk>",
-        "lstrip": false,
-        "normalized": true,
-        "rstrip": false,
-        "single_word": false
-    },
+    "unk_token": "<unk>",
     "use_default_system_prompt": true,
     "use_fast": true
 }
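The three new added_tokens_decoder entries pin <IMG_CONTEXT>, <quad> and </quad> to ids 41916-41918, and bos/eos/pad/unk are now stored as plain strings. A quick sanity check is sketched below; the path is a placeholder for a local checkout of this repository:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('/path/to/this/repo')

    # The expected ids come straight from the added_tokens_decoder entries above.
    for token, expected_id in [('<IMG_CONTEXT>', 41916), ('<quad>', 41917), ('</quad>', 41918)]:
        assert tokenizer.convert_tokens_to_ids(token) == expected_id

    # pad/unk are now plain strings in tokenizer_config.json.
    print(tokenizer.pad_token, tokenizer.unk_token)  # <unk> <unk>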