Mountchicken committed
Commit 692ce93 · verified · 1 Parent(s): 57db6e5

Upload 16 files

added_tokens.json ADDED
@@ -0,0 +1,128 @@
1
+ {
2
+ "</ground>": 151766,
3
+ "</objects>": 151768,
4
+ "</tool_call>": 151658,
5
+ "<ground>": 151765,
6
+ "<obj0>": 151665,
7
+ "<obj10>": 151675,
8
+ "<obj11>": 151676,
9
+ "<obj12>": 151677,
10
+ "<obj13>": 151678,
11
+ "<obj14>": 151679,
12
+ "<obj15>": 151680,
13
+ "<obj16>": 151681,
14
+ "<obj17>": 151682,
15
+ "<obj18>": 151683,
16
+ "<obj19>": 151684,
17
+ "<obj1>": 151666,
18
+ "<obj20>": 151685,
19
+ "<obj21>": 151686,
20
+ "<obj22>": 151687,
21
+ "<obj23>": 151688,
22
+ "<obj24>": 151689,
23
+ "<obj25>": 151690,
24
+ "<obj26>": 151691,
25
+ "<obj27>": 151692,
26
+ "<obj28>": 151693,
27
+ "<obj29>": 151694,
28
+ "<obj2>": 151667,
29
+ "<obj30>": 151695,
30
+ "<obj31>": 151696,
31
+ "<obj32>": 151697,
32
+ "<obj33>": 151698,
33
+ "<obj34>": 151699,
34
+ "<obj35>": 151700,
35
+ "<obj36>": 151701,
36
+ "<obj37>": 151702,
37
+ "<obj38>": 151703,
38
+ "<obj39>": 151704,
39
+ "<obj3>": 151668,
40
+ "<obj40>": 151705,
41
+ "<obj41>": 151706,
42
+ "<obj42>": 151707,
43
+ "<obj43>": 151708,
44
+ "<obj44>": 151709,
45
+ "<obj45>": 151710,
46
+ "<obj46>": 151711,
47
+ "<obj47>": 151712,
48
+ "<obj48>": 151713,
49
+ "<obj49>": 151714,
50
+ "<obj4>": 151669,
51
+ "<obj50>": 151715,
52
+ "<obj51>": 151716,
53
+ "<obj52>": 151717,
54
+ "<obj53>": 151718,
55
+ "<obj54>": 151719,
56
+ "<obj55>": 151720,
57
+ "<obj56>": 151721,
58
+ "<obj57>": 151722,
59
+ "<obj58>": 151723,
60
+ "<obj59>": 151724,
61
+ "<obj5>": 151670,
62
+ "<obj60>": 151725,
63
+ "<obj61>": 151726,
64
+ "<obj62>": 151727,
65
+ "<obj63>": 151728,
66
+ "<obj64>": 151729,
67
+ "<obj65>": 151730,
68
+ "<obj66>": 151731,
69
+ "<obj67>": 151732,
70
+ "<obj68>": 151733,
71
+ "<obj69>": 151734,
72
+ "<obj6>": 151671,
73
+ "<obj70>": 151735,
74
+ "<obj71>": 151736,
75
+ "<obj72>": 151737,
76
+ "<obj73>": 151738,
77
+ "<obj74>": 151739,
78
+ "<obj75>": 151740,
79
+ "<obj76>": 151741,
80
+ "<obj77>": 151742,
81
+ "<obj78>": 151743,
82
+ "<obj79>": 151744,
83
+ "<obj7>": 151672,
84
+ "<obj80>": 151745,
85
+ "<obj81>": 151746,
86
+ "<obj82>": 151747,
87
+ "<obj83>": 151748,
88
+ "<obj84>": 151749,
89
+ "<obj85>": 151750,
90
+ "<obj86>": 151751,
91
+ "<obj87>": 151752,
92
+ "<obj88>": 151753,
93
+ "<obj89>": 151754,
94
+ "<obj8>": 151673,
95
+ "<obj90>": 151755,
96
+ "<obj91>": 151756,
97
+ "<obj92>": 151757,
98
+ "<obj93>": 151758,
99
+ "<obj94>": 151759,
100
+ "<obj95>": 151760,
101
+ "<obj96>": 151761,
102
+ "<obj97>": 151762,
103
+ "<obj98>": 151763,
104
+ "<obj99>": 151764,
105
+ "<obj9>": 151674,
106
+ "<objects>": 151767,
107
+ "<tool_call>": 151657,
108
+ "<|box_end|>": 151649,
109
+ "<|box_start|>": 151648,
110
+ "<|endoftext|>": 151643,
111
+ "<|file_sep|>": 151664,
112
+ "<|fim_middle|>": 151660,
113
+ "<|fim_pad|>": 151662,
114
+ "<|fim_prefix|>": 151659,
115
+ "<|fim_suffix|>": 151661,
116
+ "<|im_end|>": 151645,
117
+ "<|im_start|>": 151644,
118
+ "<|image_pad|>": 151655,
119
+ "<|object_ref_end|>": 151647,
120
+ "<|object_ref_start|>": 151646,
121
+ "<|quad_end|>": 151651,
122
+ "<|quad_start|>": 151650,
123
+ "<|repo_name|>": 151663,
124
+ "<|video_pad|>": 151656,
125
+ "<|vision_end|>": 151653,
126
+ "<|vision_pad|>": 151654,
127
+ "<|vision_start|>": 151652
128
+ }
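
These entries extend the base Qwen2 tokenizer with RexSeek's object slots (<obj0>–<obj99>) and grounding markers (<ground>, </ground>, <objects>, </objects>). A minimal sketch of checking that a tokenizer loaded from this repository resolves them to the ids above; the repo id is a placeholder, not taken from this commit:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the repository this commit belongs to.
tokenizer = AutoTokenizer.from_pretrained("Mountchicken/RexSeek", trust_remote_code=True)

# Ids should match added_tokens.json, e.g. <obj0> -> 151665 and <ground> -> 151765.
print(tokenizer.convert_tokens_to_ids("<obj0>"))
print(tokenizer.convert_tokens_to_ids("<ground>"), tokenizer.convert_tokens_to_ids("</objects>"))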
clip.py ADDED
@@ -0,0 +1,127 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
4
+
5
+
6
+ class CLIPVisionTower(nn.Module):
7
+ def __init__(self, vision_tower, args, freeze_vision_tower=False, delay_load=False):
8
+ super().__init__()
9
+
10
+ self.is_loaded = False
11
+
12
+ self.vision_tower_name = vision_tower
13
+ self.select_layer = args.mm_vision_select_layer
14
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
15
+ self.freeze_vision_tower = freeze_vision_tower
16
+ if not delay_load:
17
+ self.load_model()
18
+ elif getattr(args, "unfreeze_mm_vision_tower", False):
19
+ self.load_model()
20
+ else:
21
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
22
+
23
+ def load_model(self, device_map=None):
24
+ if self.is_loaded:
25
+ print(
26
+ "{} is already loaded, `load_model` called again, skipping.".format(
27
+ self.vision_tower_name
28
+ )
29
+ )
30
+ return
31
+
32
+ self.image_processor = CLIPImageProcessor.from_pretrained(
33
+ self.vision_tower_name
34
+ )
35
+ self.vision_tower = CLIPVisionModel.from_pretrained(
36
+ self.vision_tower_name, device_map=device_map
37
+ )
38
+
39
+ if self.freeze_vision_tower:
40
+ self.vision_tower.requires_grad_(False)
41
+
42
+ self.is_loaded = True
43
+
44
+ def feature_select(self, image_forward_outs):
45
+ image_features = image_forward_outs.hidden_states[self.select_layer]
46
+ if self.select_feature == "patch":
47
+ image_features = image_features[:, 1:]
48
+ elif self.select_feature == "cls_patch":
49
+ image_features = image_features
50
+ else:
51
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
52
+ return image_features
53
+
54
+ def forward(self, images):
55
+ if type(images) is list:
56
+ image_features = []
57
+ for image in images:
58
+ if self.freeze_vision_tower:
59
+ with torch.no_grad():
60
+ image_forward_out = self.vision_tower(
61
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
62
+ output_hidden_states=True,
63
+ )
64
+ image_feature = self.feature_select(image_forward_out).to(
65
+ image.dtype
66
+ )
67
+ image_features.append(image_feature)
68
+ else:
69
+ image_forward_out = self.vision_tower(
70
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
71
+ output_hidden_states=True,
72
+ )
73
+ image_feature = self.feature_select(image_forward_out).to(
74
+ image.dtype
75
+ )
76
+ image_features.append(image_feature)
77
+ else:
78
+ if self.freeze_vision_tower:
79
+ with torch.no_grad():
80
+ image_forward_out = self.vision_tower(
81
+ images.to(device=self.device, dtype=self.dtype),
82
+ output_hidden_states=True,
83
+ )
84
+ image_features = self.feature_select(image_forward_out).to(
85
+ images.dtype
86
+ )
87
+ else:
88
+ image_forward_outs = self.vision_tower(
89
+ images.to(device=self.device, dtype=self.dtype),
90
+ output_hidden_states=True,
91
+ )
92
+ image_features = self.feature_select(image_forward_outs).to(
93
+ images.dtype
94
+ )
95
+
96
+ return image_features
97
+
98
+ @property
99
+ def dummy_feature(self):
100
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
101
+
102
+ @property
103
+ def dtype(self):
104
+ return self.vision_tower.dtype
105
+
106
+ @property
107
+ def device(self):
108
+ return self.vision_tower.device
109
+
110
+ @property
111
+ def config(self):
112
+ if self.is_loaded:
113
+ return self.vision_tower.config
114
+ else:
115
+ return self.cfg_only
116
+
117
+ @property
118
+ def hidden_size(self):
119
+ return self.config.hidden_size
120
+
121
+ @property
122
+ def num_patches_per_side(self):
123
+ return self.config.image_size // self.config.patch_size
124
+
125
+ @property
126
+ def num_patches(self):
127
+ return (self.config.image_size // self.config.patch_size) ** 2
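
A rough standalone usage sketch for CLIPVisionTower, assuming this repository's clip.py is importable from the working directory (not the OpenAI `clip` package) and that the two config attributes it reads are supplied through a simple namespace; values mirror config.json in this commit:

import torch
from types import SimpleNamespace

from clip import CLIPVisionTower  # this repository's clip.py

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args, freeze_vision_tower=True)

pixel_values = torch.randn(1, 3, 336, 336)  # already-preprocessed pixel values
with torch.no_grad():
    features = tower(pixel_values)          # penultimate-layer patch tokens: (1, 576, 1024)
print(features.shape)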
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "architectures": [
+     "RexSeekQwenForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "modeling_rexseek.RexSeekQwenConfig",
+     "AutoModelForCausalLM": "modeling_rexseek.RexSeekQwenForCausalLM"
+   },
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "freeze_mm_mlp_adapter": false,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "image_aspect_ratio": "pad",
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 70,
+   "mm_hidden_size": 2560,
+   "mm_patch_merge_type": "flat",
+   "mm_projector_lr": null,
+   "mm_projector_type": "mlp2x_gelu",
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -2,
+   "mm_vision_tower": "openai/clip-vit-large-patch14-336",
+   "model_type": "rexseek_qwen",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 36,
+   "num_key_value_heads": 2,
+   "object_hidden_size": 2880,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "tokenizer_model_max_length": 2048,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.48.0",
+   "use_cache": true,
+   "use_mm_proj": true,
+   "use_sliding_window": false,
+   "vis_during_training_prob": 0.0,
+   "vocab_size": 151769
+ }
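
Because of the auto_map entries, the configuration resolves to the custom RexSeekQwenConfig through the Auto classes when trust_remote_code=True. A brief sketch, with the repo id left as a placeholder:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Mountchicken/RexSeek", trust_remote_code=True)  # placeholder repo id
print(cfg.model_type)       # "rexseek_qwen"
print(cfg.mm_vision_tower)  # "openai/clip-vit-large-patch14-336"
print(cfg.vocab_size)       # 151769, matching the added tokens above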
convnext.py ADDED
@@ -0,0 +1,697 @@
1
+ from functools import partial
2
+ from typing import Callable, List, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from open_clip.factory import get_model_config
7
+ from open_clip.model import CLIPVisionCfg
8
+ from timm.layers import (
9
+ AvgPool2dSame,
10
+ ClassifierHead,
11
+ DropPath,
12
+ GlobalResponseNormMlp,
13
+ LayerNorm,
14
+ LayerNorm2d,
15
+ Mlp,
16
+ NormMlpClassifierHead,
17
+ create_conv2d,
18
+ get_act_layer,
19
+ make_divisible,
20
+ to_ntuple,
21
+ trunc_normal_,
22
+ )
23
+ from timm.models._builder import build_model_with_cfg
24
+ from timm.models._features import feature_take_indices
25
+ from timm.models._manipulate import checkpoint_seq, named_apply
26
+
27
+ __all__ = ["ConvNeXt"] # model_registry will add each entrypoint fn to this
28
+
29
+
30
+ class Downsample(nn.Module):
31
+
32
+ def __init__(self, in_chs, out_chs, stride=1, dilation=1):
33
+ super().__init__()
34
+ avg_stride = stride if dilation == 1 else 1
35
+ if stride > 1 or dilation > 1:
36
+ avg_pool_fn = (
37
+ AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
38
+ )
39
+ self.pool = avg_pool_fn(
40
+ 2, avg_stride, ceil_mode=True, count_include_pad=False
41
+ )
42
+ else:
43
+ self.pool = nn.Identity()
44
+
45
+ if in_chs != out_chs:
46
+ self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
47
+ else:
48
+ self.conv = nn.Identity()
49
+
50
+ def forward(self, x):
51
+ x = self.pool(x)
52
+ x = self.conv(x)
53
+ return x
54
+
55
+
56
+ class ConvNeXtBlock(nn.Module):
57
+ """ConvNeXt Block
58
+ There are two equivalent implementations:
59
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
60
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
61
+
62
+ Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
63
+ choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
64
+ is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
65
+ """
66
+
67
+ def __init__(
68
+ self,
69
+ in_chs: int,
70
+ out_chs: Optional[int] = None,
71
+ kernel_size: int = 7,
72
+ stride: int = 1,
73
+ dilation: Union[int, Tuple[int, int]] = (1, 1),
74
+ mlp_ratio: float = 4,
75
+ conv_mlp: bool = False,
76
+ conv_bias: bool = True,
77
+ use_grn: bool = False,
78
+ ls_init_value: Optional[float] = 1e-6,
79
+ act_layer: Union[str, Callable] = "gelu",
80
+ norm_layer: Optional[Callable] = None,
81
+ drop_path: float = 0.0,
82
+ ):
83
+ """
84
+
85
+ Args:
86
+ in_chs: Block input channels.
87
+ out_chs: Block output channels (same as in_chs if None).
88
+ kernel_size: Depthwise convolution kernel size.
89
+ stride: Stride of depthwise convolution.
90
+ dilation: Tuple specifying input and output dilation of block.
91
+ mlp_ratio: MLP expansion ratio.
92
+ conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
93
+ conv_bias: Apply bias for all convolution (linear) layers.
94
+ use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
95
+ ls_init_value: Layer-scale init values, layer-scale applied if not None.
96
+ act_layer: Activation layer.
97
+ norm_layer: Normalization layer (defaults to LN if not specified).
98
+ drop_path: Stochastic depth probability.
99
+ """
100
+ super().__init__()
101
+ out_chs = out_chs or in_chs
102
+ dilation = to_ntuple(2)(dilation)
103
+ act_layer = get_act_layer(act_layer)
104
+ if not norm_layer:
105
+ norm_layer = LayerNorm2d if conv_mlp else LayerNorm
106
+ mlp_layer = partial(
107
+ GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp
108
+ )
109
+ self.use_conv_mlp = conv_mlp
110
+ self.conv_dw = create_conv2d(
111
+ in_chs,
112
+ out_chs,
113
+ kernel_size=kernel_size,
114
+ stride=stride,
115
+ dilation=dilation[0],
116
+ depthwise=True,
117
+ bias=conv_bias,
118
+ )
119
+ self.norm = norm_layer(out_chs)
120
+ self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
121
+ self.ramma = (
122
+ nn.Parameter(ls_init_value * torch.ones(out_chs))
123
+ if ls_init_value is not None
124
+ else None
125
+ )
126
+ if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
127
+ self.shortcut = Downsample(
128
+ in_chs, out_chs, stride=stride, dilation=dilation[0]
129
+ )
130
+ else:
131
+ self.shortcut = nn.Identity()
132
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
133
+
134
+ def forward(self, x):
135
+ shortcut = x
136
+ x = self.conv_dw(x)
137
+ if self.use_conv_mlp:
138
+ x = self.norm(x)
139
+ x = self.mlp(x)
140
+ else:
141
+ x = x.permute(0, 2, 3, 1)
142
+ x = self.norm(x)
143
+ x = self.mlp(x)
144
+ x = x.permute(0, 3, 1, 2)
145
+ if self.ramma is not None:
146
+ x = x.mul(self.ramma.reshape(1, -1, 1, 1))
147
+
148
+ x = self.drop_path(x) + self.shortcut(shortcut)
149
+ return x
150
+
151
+
152
+ class ConvNeXtStage(nn.Module):
153
+
154
+ def __init__(
155
+ self,
156
+ in_chs,
157
+ out_chs,
158
+ kernel_size=7,
159
+ stride=2,
160
+ depth=2,
161
+ dilation=(1, 1),
162
+ drop_path_rates=None,
163
+ ls_init_value=1.0,
164
+ conv_mlp=False,
165
+ conv_bias=True,
166
+ use_grn=False,
167
+ act_layer="gelu",
168
+ norm_layer=None,
169
+ norm_layer_cl=None,
170
+ ):
171
+ super().__init__()
172
+ self.grad_checkpointing = False
173
+
174
+ if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
175
+ ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
176
+ pad = (
177
+ "same" if dilation[1] > 1 else 0
178
+ ) # same padding needed if dilation used
179
+ self.downsample = nn.Sequential(
180
+ norm_layer(in_chs),
181
+ create_conv2d(
182
+ in_chs,
183
+ out_chs,
184
+ kernel_size=ds_ks,
185
+ stride=stride,
186
+ dilation=dilation[0],
187
+ padding=pad,
188
+ bias=conv_bias,
189
+ ),
190
+ )
191
+ in_chs = out_chs
192
+ else:
193
+ self.downsample = nn.Identity()
194
+
195
+ drop_path_rates = drop_path_rates or [0.0] * depth
196
+ stage_blocks = []
197
+ for i in range(depth):
198
+ stage_blocks.append(
199
+ ConvNeXtBlock(
200
+ in_chs=in_chs,
201
+ out_chs=out_chs,
202
+ kernel_size=kernel_size,
203
+ dilation=dilation[1],
204
+ drop_path=drop_path_rates[i],
205
+ ls_init_value=ls_init_value,
206
+ conv_mlp=conv_mlp,
207
+ conv_bias=conv_bias,
208
+ use_grn=use_grn,
209
+ act_layer=act_layer,
210
+ norm_layer=norm_layer if conv_mlp else norm_layer_cl,
211
+ )
212
+ )
213
+ in_chs = out_chs
214
+ self.blocks = nn.Sequential(*stage_blocks)
215
+
216
+ def forward(self, x):
217
+ x = self.downsample(x)
218
+ if self.grad_checkpointing and not torch.jit.is_scripting():
219
+ x = checkpoint_seq(self.blocks, x)
220
+ else:
221
+ x = self.blocks(x)
222
+ return x
223
+
224
+
225
+ class ConvNeXt(nn.Module):
226
+ r"""ConvNeXt
227
+ A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
228
+ """
229
+
230
+ def __init__(
231
+ self,
232
+ in_chans: int = 3,
233
+ num_classes: int = 1000,
234
+ global_pool: str = "avg",
235
+ output_stride: int = 32,
236
+ depths: Tuple[int, ...] = (3, 3, 9, 3),
237
+ dims: Tuple[int, ...] = (96, 192, 384, 768),
238
+ kernel_sizes: Union[int, Tuple[int, ...]] = 7,
239
+ ls_init_value: Optional[float] = 1e-6,
240
+ stem_type: str = "patch",
241
+ patch_size: int = 4,
242
+ head_init_scale: float = 1.0,
243
+ head_norm_first: bool = False,
244
+ head_hidden_size: Optional[int] = None,
245
+ conv_mlp: bool = False,
246
+ conv_bias: bool = True,
247
+ use_grn: bool = False,
248
+ act_layer: Union[str, Callable] = "gelu",
249
+ norm_layer: Optional[Union[str, Callable]] = None,
250
+ norm_eps: Optional[float] = None,
251
+ drop_rate: float = 0.0,
252
+ drop_path_rate: float = 0.0,
253
+ ):
254
+ """
255
+ Args:
256
+ in_chans: Number of input image channels.
257
+ num_classes: Number of classes for classification head.
258
+ global_pool: Global pooling type.
259
+ output_stride: Output stride of network, one of (8, 16, 32).
260
+ depths: Number of blocks at each stage.
261
+ dims: Feature dimension at each stage.
262
+ kernel_sizes: Depthwise convolution kernel-sizes for each stage.
263
+ ls_init_value: Init value for Layer Scale, disabled if None.
264
+ stem_type: Type of stem.
265
+ patch_size: Stem patch size for patch stem.
266
+ head_init_scale: Init scaling value for classifier weights and biases.
267
+ head_norm_first: Apply normalization before global pool + head.
268
+ head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
269
+ conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
270
+ conv_bias: Use bias layers w/ all convolutions.
271
+ use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
272
+ act_layer: Activation layer type.
273
+ norm_layer: Normalization layer type.
274
+ drop_rate: Head pre-classifier dropout rate.
275
+ drop_path_rate: Stochastic depth drop rate.
276
+ """
277
+ super().__init__()
278
+ assert output_stride in (8, 16, 32)
279
+ kernel_sizes = to_ntuple(4)(kernel_sizes)
280
+ if norm_layer is None:
281
+ norm_layer = LayerNorm2d
282
+ norm_layer_cl = norm_layer if conv_mlp else LayerNorm
283
+ if norm_eps is not None:
284
+ norm_layer = partial(norm_layer, eps=norm_eps)
285
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
286
+ else:
287
+ assert (
288
+ conv_mlp
289
+ ), "If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input"
290
+ norm_layer_cl = norm_layer
291
+ if norm_eps is not None:
292
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
293
+
294
+ self.num_classes = num_classes
295
+ self.drop_rate = drop_rate
296
+ self.feature_info = []
297
+
298
+ assert stem_type in ("patch", "overlap", "overlap_tiered")
299
+ if stem_type == "patch":
300
+ # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
301
+ self.stem = nn.Sequential(
302
+ nn.Conv2d(
303
+ in_chans,
304
+ dims[0],
305
+ kernel_size=patch_size,
306
+ stride=patch_size,
307
+ bias=conv_bias,
308
+ ),
309
+ norm_layer(dims[0]),
310
+ )
311
+ stem_stride = patch_size
312
+ else:
313
+ mid_chs = make_divisible(dims[0] // 2) if "tiered" in stem_type else dims[0]
314
+ self.stem = nn.Sequential(
315
+ nn.Conv2d(
316
+ in_chans,
317
+ mid_chs,
318
+ kernel_size=3,
319
+ stride=2,
320
+ padding=1,
321
+ bias=conv_bias,
322
+ ),
323
+ nn.Conv2d(
324
+ mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias
325
+ ),
326
+ norm_layer(dims[0]),
327
+ )
328
+ stem_stride = 4
329
+
330
+ self.stages = nn.Sequential()
331
+ dp_rates = [
332
+ x.tolist()
333
+ for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)
334
+ ]
335
+ stages = []
336
+ prev_chs = dims[0]
337
+ curr_stride = stem_stride
338
+ dilation = 1
339
+ # 4 feature resolution stages, each consisting of multiple residual blocks
340
+ for i in range(4):
341
+ stride = 2 if curr_stride == 2 or i > 0 else 1
342
+ if curr_stride >= output_stride and stride > 1:
343
+ dilation *= stride
344
+ stride = 1
345
+ curr_stride *= stride
346
+ first_dilation = 1 if dilation in (1, 2) else 2
347
+ out_chs = dims[i]
348
+ stages.append(
349
+ ConvNeXtStage(
350
+ prev_chs,
351
+ out_chs,
352
+ kernel_size=kernel_sizes[i],
353
+ stride=stride,
354
+ dilation=(first_dilation, dilation),
355
+ depth=depths[i],
356
+ drop_path_rates=dp_rates[i],
357
+ ls_init_value=ls_init_value,
358
+ conv_mlp=conv_mlp,
359
+ conv_bias=conv_bias,
360
+ use_grn=use_grn,
361
+ act_layer=act_layer,
362
+ norm_layer=norm_layer,
363
+ norm_layer_cl=norm_layer_cl,
364
+ )
365
+ )
366
+ prev_chs = out_chs
367
+ # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
368
+ self.feature_info += [
369
+ dict(num_chs=prev_chs, reduction=curr_stride, module=f"stages.{i}")
370
+ ]
371
+ self.stages = nn.Sequential(*stages)
372
+ self.num_features = self.head_hidden_size = prev_chs
373
+
374
+ # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
375
+ # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
376
+ if head_norm_first:
377
+ assert not head_hidden_size
378
+ self.norm_pre = norm_layer(self.num_features)
379
+ self.head = ClassifierHead(
380
+ self.num_features,
381
+ num_classes,
382
+ pool_type=global_pool,
383
+ drop_rate=self.drop_rate,
384
+ )
385
+ else:
386
+ self.norm_pre = nn.Identity()
387
+ self.head = NormMlpClassifierHead(
388
+ self.num_features,
389
+ num_classes,
390
+ hidden_size=head_hidden_size,
391
+ pool_type=global_pool,
392
+ drop_rate=self.drop_rate,
393
+ norm_layer=norm_layer,
394
+ act_layer="gelu",
395
+ )
396
+ self.head_hidden_size = self.head.num_features
397
+ named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
398
+
399
+ @torch.jit.ignore
400
+ def group_matcher(self, coarse=False):
401
+ return dict(
402
+ stem=r"^stem",
403
+ blocks=(
404
+ r"^stages\.(\d+)"
405
+ if coarse
406
+ else [
407
+ (r"^stages\.(\d+)\.downsample", (0,)), # blocks
408
+ (r"^stages\.(\d+)\.blocks\.(\d+)", None),
409
+ (r"^norm_pre", (99999,)),
410
+ ]
411
+ ),
412
+ )
413
+
414
+ @torch.jit.ignore
415
+ def set_grad_checkpointing(self, enable=True):
416
+ for s in self.stages:
417
+ s.grad_checkpointing = enable
418
+
419
+ @torch.jit.ignore
420
+ def get_classifier(self) -> nn.Module:
421
+ return self.head.fc
422
+
423
+ def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None):
424
+ self.num_classes = num_classes
425
+ self.head.reset(num_classes, global_pool)
426
+
427
+ def forward_intermediates(
428
+ self,
429
+ x: torch.Tensor,
430
+ indices: Optional[Union[int, List[int], Tuple[int]]] = None,
431
+ norm: bool = False,
432
+ stop_early: bool = False,
433
+ output_fmt: str = "NCHW",
434
+ intermediates_only: bool = False,
435
+ ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
436
+ """Forward features that returns intermediates.
437
+
438
+ Args:
439
+ x: Input image tensor
440
+ indices: Take last n blocks if int, all if None, select matching indices if sequence
441
+ norm: Apply norm layer to compatible intermediates
442
+ stop_early: Stop iterating over blocks when last desired intermediate hit
443
+ output_fmt: Shape of intermediate feature outputs
444
+ intermediates_only: Only return intermediate features
445
+ Returns:
446
+
447
+ """
448
+ assert output_fmt in ("NCHW",), "Output shape must be NCHW."
449
+ intermediates = []
450
+ take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
451
+
452
+ # forward pass
453
+ feat_idx = 0 # stem is index 0
454
+ x = self.stem(x)
455
+ if feat_idx in take_indices:
456
+ intermediates.append(x)
457
+
458
+ if (
459
+ torch.jit.is_scripting() or not stop_early
460
+ ): # can't slice blocks in torchscript
461
+ stages = self.stages
462
+ else:
463
+ stages = self.stages[:max_index]
464
+ for stage in stages:
465
+ feat_idx += 1
466
+ x = stage(x)
467
+ if feat_idx in take_indices:
468
+ # NOTE not bothering to apply norm_pre when norm=True as almost no models have it enabled
469
+ intermediates.append(x)
470
+
471
+ if intermediates_only:
472
+ return intermediates
473
+
474
+ x = self.norm_pre(x)
475
+
476
+ return x, intermediates
477
+
478
+ def prune_intermediate_layers(
479
+ self,
480
+ indices: Union[int, List[int], Tuple[int]] = 1,
481
+ prune_norm: bool = False,
482
+ prune_head: bool = True,
483
+ ):
484
+ """Prune layers not required for specified intermediates."""
485
+ take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
486
+ self.stages = self.stages[:max_index] # truncate blocks w/ stem as idx 0
487
+ if prune_norm:
488
+ self.norm_pre = nn.Identity()
489
+ if prune_head:
490
+ self.reset_classifier(0, "")
491
+ return take_indices
492
+
493
+ def forward_features(self, x):
494
+ x = self.stem(x)
495
+ x = self.stages(x)
496
+ x = self.norm_pre(x)
497
+ return x
498
+
499
+ def forward_head(self, x, pre_logits: bool = False):
500
+ return self.head(x, pre_logits=True) if pre_logits else self.head(x)
501
+
502
+ def forward(self, x):
503
+ x = self.forward_features(x)
504
+ x = self.forward_head(x)
505
+ return x
506
+
507
+
508
+ def _init_weights(module, name=None, head_init_scale=1.0):
509
+ if isinstance(module, nn.Conv2d):
510
+ trunc_normal_(module.weight, std=0.02)
511
+ if module.bias is not None:
512
+ nn.init.zeros_(module.bias)
513
+ elif isinstance(module, nn.Linear):
514
+ trunc_normal_(module.weight, std=0.02)
515
+ nn.init.zeros_(module.bias)
516
+ if name and "head." in name:
517
+ module.weight.data.mul_(head_init_scale)
518
+ module.bias.data.mul_(head_init_scale)
519
+
520
+
521
+ def checkpoint_filter_fn(state_dict, model):
522
+ """Remap FB checkpoints -> timm"""
523
+ if "head.norm.weight" in state_dict or "norm_pre.weight" in state_dict:
524
+ return state_dict # non-FB checkpoint
525
+ if "model" in state_dict:
526
+ state_dict = state_dict["model"]
527
+
528
+ out_dict = {}
529
+ if "visual.trunk.stem.0.weight" in state_dict:
530
+ out_dict = {
531
+ k.replace("visual.trunk.", ""): v
532
+ for k, v in state_dict.items()
533
+ if k.startswith("visual.trunk.")
534
+ }
535
+ if "visual.head.proj.weight" in state_dict:
536
+ out_dict["head.fc.weight"] = state_dict["visual.head.proj.weight"]
537
+ out_dict["head.fc.bias"] = torch.zeros(
538
+ state_dict["visual.head.proj.weight"].shape[0]
539
+ )
540
+ elif "visual.head.mlp.fc1.weight" in state_dict:
541
+ out_dict["head.pre_logits.fc.weight"] = state_dict[
542
+ "visual.head.mlp.fc1.weight"
543
+ ]
544
+ out_dict["head.pre_logits.fc.bias"] = state_dict["visual.head.mlp.fc1.bias"]
545
+ out_dict["head.fc.weight"] = state_dict["visual.head.mlp.fc2.weight"]
546
+ out_dict["head.fc.bias"] = torch.zeros(
547
+ state_dict["visual.head.mlp.fc2.weight"].shape[0]
548
+ )
549
+ return out_dict
550
+
551
+ import re
552
+
553
+ for k, v in state_dict.items():
554
+ k = k.replace("downsample_layers.0.", "stem.")
555
+ k = re.sub(r"stages.([0-9]+).([0-9]+)", r"stages.\1.blocks.\2", k)
556
+ k = re.sub(
557
+ r"downsample_layers.([0-9]+).([0-9]+)", r"stages.\1.downsample.\2", k
558
+ )
559
+ k = k.replace("dwconv", "conv_dw")
560
+ k = k.replace("pwconv", "mlp.fc")
561
+ if "grn" in k:
562
+ k = k.replace("grn.beta", "mlp.grn.bias")
563
+ k = k.replace("grn.ramma", "mlp.grn.weight")
564
+ v = v.reshape(v.shape[-1])
565
+ k = k.replace("head.", "head.fc.")
566
+ if k.startswith("norm."):
567
+ k = k.replace("norm", "head.norm")
568
+ if v.ndim == 2 and "head" not in k:
569
+ model_shape = model.state_dict()[k].shape
570
+ v = v.reshape(model_shape)
571
+ out_dict[k] = v
572
+
573
+ return out_dict
574
+
575
+
576
+ def _create_convnext(variant, pretrained=False, **kwargs):
577
+ if kwargs.get("pretrained_cfg", "") == "fcmae":
578
+ # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`)
579
+ # This is workaround loading with num_classes=0 w/o removing norm-layer.
580
+ kwargs.setdefault("pretrained_strict", False)
581
+
582
+ model = build_model_with_cfg(
583
+ ConvNeXt,
584
+ variant,
585
+ pretrained,
586
+ pretrained_filter_fn=checkpoint_filter_fn,
587
+ feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
588
+ **kwargs,
589
+ )
590
+ return model
591
+
592
+
593
+ def convnext_large(pretrained=False, **kwargs) -> ConvNeXt:
594
+ model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536])
595
+ model = _create_convnext(
596
+ "convnext_large", pretrained=pretrained, **dict(model_args, **kwargs)
597
+ )
598
+ return model
599
+
600
+
601
+ class CLIP(nn.Module):
602
+ output_dict: torch.jit.Final[bool]
603
+
604
+ def __init__(
605
+ self,
606
+ embed_dim: int,
607
+ vision_cfg: CLIPVisionCfg,
608
+ quick_gelu: bool = False,
609
+ cast_dtype: Optional[torch.dtype] = None,
610
+ output_dict: bool = False,
611
+ **kwargs,
612
+ ):
613
+ super().__init__()
614
+ self.output_dict = output_dict
615
+
616
+ self.visual = convnext_large()
617
+
618
+
619
+ class ConvNextVisionEncoder(nn.Module):
620
+ def __init__(
621
+ self,
622
+ ):
623
+ super().__init__()
624
+ self.model_type = "convnext_large_d_320"
625
+ self.model_channel = [192, 384, 768, 1536] # stage 0-3
626
+
627
+ clip_model = CLIP(**get_model_config(self.model_type), use_text=False)
628
+
629
+ # decompose stem and stages blocks in vision tower
630
+ self.vision_stem = clip_model.visual.stem
631
+ self.vision_stages = clip_model.visual.stages
632
+
633
+ def forward(self, images):
634
+
635
+ if type(images) is list:
636
+ image_features = []
637
+ for image in images:
638
+ image_feature = self.backbone(
639
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
640
+ )
641
+ image_features.append(image_feature)
642
+ else:
643
+ image_features = self.backbone(
644
+ images.to(device=self.device, dtype=self.dtype),
645
+ )
646
+
647
+ return {
648
+ "image_features": image_features,
649
+ "last_feat": image_features[-1],
650
+ }
651
+
652
+ def backbone(self, images: torch.Tensor) -> List[torch.Tensor]:
653
+ """Process the input images through the backbone network.
654
+
655
+ Inputs:
656
+ images (torch.Tensor): The input images.
657
+
658
+ Returns:
659
+ List[torch.Tensor]: A list of per-stage feature maps; the per-level channel
+ counts are listed in self.model_channel.
661
+ """
662
+ with torch.no_grad():
663
+ results = self.basic_forward(images)
664
+ feature_maps = []
665
+
666
+ for _stage in results:
667
+ feature_maps.append(results[_stage].contiguous())
668
+ return feature_maps
669
+
670
+ def basic_forward(self, images):
671
+ results = {}
672
+ x = self.vision_stem(images)
673
+ for _idx in range(len(self.vision_stages)):
674
+ x = self.vision_stages[_idx](x)
675
+ results[f"stage_{_idx}"] = x
676
+ return results
677
+
678
+ @property
679
+ def dtype(self):
680
+ return self.vision_stem[0].weight.dtype
681
+
682
+ @property
683
+ def device(self):
684
+ return self.vision_stem[0].weight.device
685
+
686
+ @property
687
+ def config(self):
688
+ return self.vision_config
689
+
690
+ @property
691
+ def hidden_size(self):
692
+ return sum(self.model_channel)
693
+
694
+
695
+ if __name__ == "__main__":
696
+ model = ConvNextVisionEncoder()
697
+ print(model.state_dict().keys())
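
A hedged usage sketch for this auxiliary high-resolution encoder, assuming timm and open_clip are installed and that inputs follow the 768x768 resolution implied by the spatail_scale = 192 / 768 setting in modeling_rexseek.py later in this commit:

import torch

from convnext import ConvNextVisionEncoder  # this repository's convnext.py

encoder = ConvNextVisionEncoder().eval()
images = torch.randn(1, 3, 768, 768)  # assumed input resolution
out = encoder(images)
for name, feat in zip(["stage_0", "stage_1", "stage_2", "stage_3"], out["image_features"]):
    print(name, tuple(feat.shape))    # channel counts follow [192, 384, 768, 1536]
print("last_feat", tuple(out["last_feat"].shape))  # (1, 1536, 24, 24) for a 768x768 input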
generation_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "attn_implementation": "flash_attention_2",
+   "bos_token_id": 151643,
+   "delay_load": false,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.48.0"
+ }
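
These sampling defaults are picked up by generate() automatically once the model is loaded from the repository; they can also be inspected or overridden explicitly. A small sketch with a placeholder repo id:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("Mountchicken/RexSeek")  # placeholder repo id
print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k, gen_cfg.repetition_penalty)
# Passing generation_config=gen_cfg (or the individual kwargs) to model.generate() applies the same settings.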
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9eaba82827e894601eb5fe8338dd1c9b146ab749ab07287950b9069823743d1
3
+ size 4956876272
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab2bf0f72bd76bb22ec2e430f012067f16faeef970b43ce64abefc3777fcb1b4
3
+ size 2874661528
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_rexseek.py ADDED
@@ -0,0 +1,666 @@
1
+ import logging
2
+ import math
3
+ import os
4
+ import re
5
+ from typing import List, Optional, Union
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch import nn
11
+ from torchvision.ops import roi_align
12
+ from transformers import (
13
+ AutoConfig,
14
+ AutoModel,
15
+ AutoModelForCausalLM,
16
+ Qwen2Config,
17
+ Qwen2ForCausalLM,
18
+ StoppingCriteria,
19
+ StoppingCriteriaList,
20
+ )
21
+ from transformers.generation.utils import GenerateOutput
22
+ from transformers.utils import logging, strtobool
23
+
24
+ from .clip import CLIPVisionTower
25
+ from .convnext import ConvNextVisionEncoder
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
30
+ XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
31
+
32
+ IGNORE_INDEX = -100
33
+ DEFAULT_PAD_TOKEN_INDEX = 0
34
+ IMAGE_TOKEN_INDEX = -200
35
+ DEFAULT_IMAGE_TOKEN = "<image>"
36
+
37
+ # For Objects
38
+ DEFAULT_OBJECT_TOKEN = "<obj<i>>"
39
+ DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
40
+ DEFAULT_OBJECT_INDEX = -300
41
+
42
+ # For Grounding
43
+ DEFAULT_GROUNDING_START = "<ground>"
44
+ DEFAULT_GROUNDING_END = "</ground>"
45
+ DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
46
+ DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
47
+
48
+
49
+ def is_fsdp_enabled():
50
+ return (
51
+ torch.distributed.is_available()
52
+ and torch.distributed.is_initialized()
53
+ and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1
54
+ and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1
55
+ )
56
+
57
+
58
+ class IdentityMap(nn.Module):
59
+ def __init__(self):
60
+ super().__init__()
61
+
62
+ def forward(self, x, *args, **kwargs):
63
+ return x
64
+
65
+ @property
66
+ def config(self):
67
+ return {"mm_projector_type": "identity"}
68
+
69
+
70
+ class SimpleResBlock(nn.Module):
71
+ def __init__(self, channels):
72
+ super().__init__()
73
+ self.pre_norm = nn.LayerNorm(channels)
74
+
75
+ self.proj = nn.Sequential(
76
+ nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
77
+ )
78
+
79
+ def forward(self, x):
80
+ x = self.pre_norm(x)
81
+ return x + self.proj(x)
82
+
83
+
84
+ def build_vision_projector(config, start_hidden_size, delay_load=False, **kwargs):
85
+ projector_type = "mlp2x_gelu"
86
+
87
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
88
+ if mlp_gelu_match:
89
+ mlp_depth = int(mlp_gelu_match.group(1))
90
+ modules = [nn.Linear(start_hidden_size, config.hidden_size)]
91
+ for _ in range(1, mlp_depth):
92
+ modules.append(nn.GELU())
93
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
94
+ return nn.Sequential(*modules)
95
+
96
+ if projector_type == "identity":
97
+ return IdentityMap()
98
+
99
+ raise ValueError(f"Unknown projector type: {projector_type}")
100
+
101
+
102
+ def get_token_slices(input_ids: torch.Tensor):
103
+ """
104
+ Get slices of tokens based on special markers in the input tensor.
105
+
106
+ Args:
107
+ input_ids (torch.Tensor): A tensor of token IDs where IMAGE_TOKEN_INDEX represents an image token,
108
+ DEFAULT_OBJECT_INDEX represents an object token, and all other values represent text tokens.
109
+
110
+ Returns:
111
+ List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the type of the
112
+ token slice ('text', 'image', 'object') and the span as a list of start and end indices.
113
+ """
114
+ # define type markers and corresponding types
115
+ type_map = {IMAGE_TOKEN_INDEX: "image", DEFAULT_OBJECT_INDEX: "object"}
116
+
117
+ # find the positions of special markers
118
+ image_indices = torch.where(input_ids == IMAGE_TOKEN_INDEX)[0]
119
+ object_indices = torch.where(input_ids == DEFAULT_OBJECT_INDEX)[0]
120
+ if len(object_indices) > 0:
121
+ has_object = True
122
+ else:
123
+ has_object = False
124
+
125
+ # merge all the positions of special markers
126
+ special_indices = torch.cat((image_indices, object_indices))
127
+ special_indices, _ = torch.sort(special_indices)
128
+ special_tokens = input_ids[special_indices]
129
+
130
+ slices = []
131
+ start_idx = 0
132
+
133
+ for i, idx in enumerate(special_indices):
134
+ if start_idx < idx:
135
+ slices.append({"type": "text", "span": [start_idx, idx.item()]})
136
+ token_type = type_map[special_tokens[i].item()]
137
+ slices.append({"type": token_type, "span": [idx.item(), idx.item() + 1]})
138
+ start_idx = idx.item() + 1
139
+
140
+ if start_idx < len(input_ids):
141
+ slices.append({"type": "text", "span": [start_idx, len(input_ids)]})
142
+
143
+ return slices, has_object
144
+
145
+
146
+ class StopWordStoppingCriteria(StoppingCriteria):
147
+ """StopWord stopping criteria."""
148
+
149
+ def __init__(self, tokenizer, stop_word):
150
+ self.tokenizer = tokenizer
151
+ self.stop_word = stop_word
152
+ self.length = len(self.stop_word)
153
+
154
+ def __call__(self, input_ids, *args, **kwargs) -> bool:
155
+ cur_text = self.tokenizer.decode(input_ids[0])
156
+ cur_text = cur_text.replace("\r", "").replace("\n", "")
157
+ return cur_text[-self.length :] == self.stop_word
158
+
159
+
160
+ def get_stop_criteria(
161
+ tokenizer,
162
+ stop_words=[],
163
+ ):
164
+ stop_criteria = StoppingCriteriaList()
165
+ for word in stop_words:
166
+ stop_criteria.append(StopWordStoppingCriteria(tokenizer, word))
167
+ return stop_criteria
168
+
169
+
170
+ def gen_sineembed_for_position(pos_tensor, dim_of_pos_feats):
171
+ """Generate sine position embedding from a position tensor.
172
+
173
+ Args:
174
+ pos_tensor (torch.Tensor): shape: [batch_size, N, 4]. the last dimension is [cx, cy, w, h] in
175
+ normalized coordinates in range [0, 1].
176
+ dim_of_pos_feats (int): the number of sine/cosine features per box coordinate.
177
+
178
+ Returns:
179
+ pos (torch.Tensor): shape: [batch_size, N, 4 * dim_of_pos_feats] for [cx, cy, w, h] input.
180
+ """
181
+ scale = 2 * math.pi
182
+ dim_t = torch.arange(
183
+ dim_of_pos_feats, dtype=torch.float32, device=pos_tensor.device
184
+ )
185
+ dim_t = 10000 ** (2 * (dim_t // 2) / dim_of_pos_feats)
186
+ x_embed = pos_tensor[:, :, 0] * scale
187
+ y_embed = pos_tensor[:, :, 1] * scale
188
+ pos_x = x_embed[:, :, None] / dim_t
189
+ pos_y = y_embed[:, :, None] / dim_t
190
+ pos_x = torch.stack(
191
+ (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3
192
+ ).flatten(2)
193
+ pos_y = torch.stack(
194
+ (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3
195
+ ).flatten(2)
196
+ if pos_tensor.size(-1) == 2:
197
+ pos = torch.cat((pos_y, pos_x), dim=2)
198
+ elif pos_tensor.size(-1) == 4:
199
+ w_embed = pos_tensor[:, :, 2] * scale
200
+ pos_w = w_embed[:, :, None] / dim_t
201
+ pos_w = torch.stack(
202
+ (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3
203
+ ).flatten(2)
204
+
205
+ h_embed = pos_tensor[:, :, 3] * scale
206
+ pos_h = h_embed[:, :, None] / dim_t
207
+ pos_h = torch.stack(
208
+ (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3
209
+ ).flatten(2)
210
+
211
+ pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
212
+ else:
213
+ raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
214
+ return pos
215
+
216
+
217
+ class MultiLevelROIVisualPrompt(nn.Module):
218
+ """Initialize the MultiLevelROIVisualPrompt.
219
+
220
+ Args:
221
+ output_size (Optional[int]): The size of the output. Default is None.
222
+ channel_per_level (List[int]): List of channels per level. Default is [192, 384, 768, 1536].
223
+ spatial_scale (Optional[float]): The spatial scale factor. Default is None.
224
+ with_additional_projection (bool): Whether to use additional projection. Default is False.
225
+ visual_prompt_hidden_size (int): The hidden size of the visual prompt. Default is 1024.
226
+ add_pos_embedding (bool): Whether to add position embedding. Default is False.
227
+ pos_embedding_dim (int): The dimension of the position embedding. Default is 1024.
228
+ """
229
+
230
+ def __init__(
231
+ self,
232
+ output_size: int = None,
233
+ channel_per_level: List[int] = [192, 384, 768, 1536],
234
+ spatail_scale: float = None,
235
+ add_pos_embedding: bool = False,
236
+ pos_embedding_dim: int = 1024,
237
+ ):
238
+ super(MultiLevelROIVisualPrompt, self).__init__()
239
+ self.output_size = output_size
240
+ self.channel_per_level = channel_per_level
241
+ self.spatail_scale = spatail_scale
242
+ self.add_pos_embedding = add_pos_embedding
243
+ self.pos_embedding_dim = pos_embedding_dim
244
+
245
+ def __call__(
246
+ self,
247
+ multi_level_features: List[torch.Tensor],
248
+ boxes: Union[torch.Tensor, List[torch.Tensor]],
249
+ ) -> torch.Tensor:
250
+ """Performs Region of Interest (RoI) Align operator on multi-level features. The RoI
251
+ feature on each scale will go through a different linear layer for projection. Different
252
+ RoI features will be summed up and then average pooled.
253
+
254
+ Args:
255
+ multi_level_features (List[Tensor[N, C, H, W]]): Feature maps from different levels
256
+ boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
257
+ format where the regions will be taken from.
258
+ Returns:
259
+ Tensor[1, K, C]: The output tensor of shape 1 x K x C, where K is the number of RoIs
260
+ """
261
+ boxes[0] = boxes[0].float()
262
+ concat_multi_level_feature = []
263
+ max_height = max([feature.shape[2] for feature in multi_level_features])
264
+ max_width = max([feature.shape[3] for feature in multi_level_features])
265
+ # interpolate to the same size
266
+ for level, feature in enumerate(multi_level_features):
267
+ if level != 0:
268
+ concat_multi_level_feature.append(
269
+ F.interpolate(
270
+ feature.float(),
271
+ size=(max_height, max_width),
272
+ mode="bilinear",
273
+ align_corners=False,
274
+ )
275
+ )
276
+ else:
277
+ concat_multi_level_feature.append(feature.float())
278
+ concat_multi_level_feature = torch.cat(concat_multi_level_feature, dim=1)
279
+
280
+ out_box_feat = roi_align(
281
+ concat_multi_level_feature,
282
+ boxes,
283
+ output_size=self.output_size,
284
+ spatial_scale=self.spatail_scale,
285
+ )
286
+
287
+ # Average Pooling -> n,c -> 1,n,c
288
+ out_box_feat = out_box_feat.mean(dim=(2, 3)).reshape(
289
+ 1, out_box_feat.shape[0], out_box_feat.shape[1]
290
+ )
291
+ if self.add_pos_embedding:
292
+ # note: these boxes are in xyxy, unnormalized format, so normalize them first
293
+ boxes = boxes[0] # (N, 4)
294
+ boxes = boxes.to(out_box_feat.dtype)
295
+ original_img_width = max_width / self.spatail_scale
296
+ original_img_height = max_height / self.spatail_scale
297
+ boxes[:, [0, 2]] = boxes[:, [0, 2]] / original_img_width
298
+ boxes[:, [1, 3]] = boxes[:, [1, 3]] / original_img_height
299
+ # convert from xyxy to cx, cy, w, h
300
+ boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
301
+ boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
302
+ boxes[:, 0] = boxes[:, 0] + boxes[:, 2] / 2
303
+ boxes[:, 1] = boxes[:, 1] + boxes[:, 3] / 2
304
+ pos_embed = gen_sineembed_for_position(
305
+ boxes.unsqueeze(0), self.pos_embedding_dim // 4
306
+ )
307
+ out_box_feat = out_box_feat + pos_embed
308
+
309
+ return out_box_feat
310
+
311
+
312
+ class RexSeekQwenConfig(Qwen2Config):
313
+ model_type = "rexseek_qwen"
314
+
315
+
316
+ class RexSeekQwenForCausalLM(Qwen2ForCausalLM):
317
+
318
+ config_class = RexSeekQwenConfig
319
+
320
+ def __init__(self, config):
321
+ super().__init__(config)
322
+ # low resolution vision encoder
323
+ vision_tower = getattr(
324
+ config,
325
+ "mm_vision_tower",
326
+ getattr(config, "vision_tower", None),
327
+ )
328
+ self.vision_tower = CLIPVisionTower(
329
+ vision_tower,
330
+ args=config,
331
+ )
332
+ # high resolution vision encoder
333
+ self.vision_tower_aux = ConvNextVisionEncoder()
334
+
335
+ # vision projector
336
+ self.mm_projector = build_vision_projector(
337
+ config, start_hidden_size=2560
338
+ ) # projector for vision_tower
339
+ # projector for object token
340
+ self.mm_object_projector = build_vision_projector(
341
+ config, start_hidden_size=2880
342
+ )
343
+ # visual prompt encoder
344
+ self.vocab_size = config.vocab_size
345
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
346
+ # Initialize weights and apply final processing
347
+ self.box_encoder = MultiLevelROIVisualPrompt(
348
+ output_size=7,
349
+ channel_per_level=[192, 384, 768, 1536], # ConvNeXt Large
350
+ spatail_scale=192 / 768,
351
+ add_pos_embedding=True,
352
+ pos_embedding_dim=2880,
353
+ )
354
+ self.post_init()
355
+ print("model initialized")
356
+
357
+ def get_vision_tower(self):
358
+ vision_tower = getattr(self, "vision_tower", None)
359
+ if type(vision_tower) is list:
360
+ vision_tower = vision_tower[0]
361
+ return vision_tower
362
+
363
+ def get_vision_tower_aux(self):
364
+ vision_tower_aux = getattr(self, "vision_tower_aux", None)
365
+ if type(vision_tower_aux) is list:
366
+ vision_tower_aux = vision_tower_aux[0]
367
+ return vision_tower_aux
368
+
369
+ def get_model(self):
370
+ return self.model
371
+
372
+ def encode_images(self, images, images_aux):
373
+ low_res_feat = self.get_vision_tower()(images)
374
+ aux_output = self.get_vision_tower_aux()(images_aux)
375
+ visual_outputs_aux = aux_output["image_features"]
376
+ high_res_feat = aux_output["last_feat"] # (B, 1536, 24, 24)
377
+ # concat the low res features with the high res features
378
+ b, c, h, w = high_res_feat.shape # (2, 1536, 24, 24)
379
+ _, _, d = low_res_feat.shape # (2, 576, 1024)
380
+ high_res_feat = high_res_feat.view(b, c, h * w).transpose(1, 2)
381
+ image_features = torch.cat((low_res_feat, high_res_feat), dim=-1)
382
+ image_features = self.mm_projector(image_features)
383
+ return image_features, visual_outputs_aux
384
+
385
+ def encode_objects(
386
+ self, bboxes, visual_outputs_aux, dtype, num_gt_boxes_per_image=None
387
+ ):
388
+ """Encode object features from bounding boxes.
389
+
390
+ Args:
391
+ bboxes (torch.Tensor): bounding boxes in the shape of (N, 4)
392
+ visual_outputs_aux (List[torch.Tensor]): multi-level ConvNeXt feature maps used for RoI pooling
393
+
394
+ Returns:
395
+ torch.Tensor: object features in the shape of (N, hidden_size)
396
+ """
397
+ bbox_visual_outputs = []
398
+ for batch_idx, boxes in enumerate(bboxes):
399
+ num_box = (
400
+ num_gt_boxes_per_image[batch_idx]
401
+ if num_gt_boxes_per_image is not None
402
+ else len(boxes)
403
+ )
404
+ boxes = boxes[:num_box]
405
+ if len(boxes) == 0:
406
+ bbox_visual_outputs.append(None)
407
+ continue
408
+ multi_level_aux_features = [
409
+ visual_output_aux[batch_idx].unsqueeze(0)
410
+ for visual_output_aux in visual_outputs_aux
411
+ ]
412
+ out_vp_feat = self.box_encoder(
413
+ multi_level_aux_features,
414
+ [boxes],
415
+ ).squeeze(0)
416
+ out_vp_feat = out_vp_feat.to(dtype)
417
+ out_vp_feat = self.mm_object_projector(out_vp_feat)
418
+ bbox_visual_outputs.append(out_vp_feat)
419
+ # b,n,c
420
+ return bbox_visual_outputs
421
+
422
+ def prepare_inputs_labels_for_multimodal(
423
+ self,
424
+ input_ids,
425
+ position_ids,
426
+ attention_mask,
427
+ past_key_values,
428
+ labels,
429
+ pixel_values=None,
430
+ pixel_values_aux=None,
431
+ gt_boxes=None,
432
+ num_gt_boxes_per_image=None,
433
+ ):
434
+ if pixel_values is None:
435
+ return (
436
+ input_ids,
437
+ position_ids,
438
+ attention_mask,
439
+ past_key_values,
440
+ None,
441
+ labels,
442
+ )
443
+ pixel_values, visual_outputs_aux = self.encode_images(
444
+ pixel_values, pixel_values_aux
445
+ ) # (B, 576, 2048)
446
+ if gt_boxes is not None:
447
+ bbox_feats = self.encode_objects(
448
+ gt_boxes, visual_outputs_aux, pixel_values.dtype, num_gt_boxes_per_image
449
+ )
450
+ _labels = labels
451
+ _position_ids = position_ids
452
+ _attention_mask = attention_mask
453
+ if attention_mask is None:
454
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
455
+ else:
456
+ attention_mask = attention_mask.bool()  # padding mask in shape (B, L)
457
+ if position_ids is None:
458
+ position_ids = torch.arange(
459
+ 0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
460
+ )
461
+ if labels is None:
462
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
463
+
464
+ input_ids = [
465
+ cur_input_ids[cur_attention_mask]
466
+ for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
467
+ ]
468
+ labels = [
469
+ cur_labels[cur_attention_mask]
470
+ for cur_labels, cur_attention_mask in zip(labels, attention_mask)
471
+ ]
472
+
473
+ new_input_embeds = []
474
+ new_labels = []
475
+ cur_image_idx = 0
476
+ cur_object_idx = 0
477
+ for batch_idx, cur_input_ids in enumerate(input_ids):
478
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
479
+ if num_images == 0:
480
+ cur_image_features = pixel_values[cur_image_idx]
481
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
482
+ cur_input_embeds = torch.cat(
483
+ [cur_input_embeds_1, cur_image_features[0:0]], dim=0
484
+ )
485
+ new_input_embeds.append(cur_input_embeds)
486
+ new_labels.append(labels[batch_idx])
487
+ cur_image_idx += 1
488
+ cur_object_idx += 1
489
+ continue
490
+
491
+ cur_labels = labels[batch_idx]
492
+ token_slices, has_object = get_token_slices(cur_input_ids)
493
+ result_input_embeddings = []
494
+ result_output_labels = []
495
+ cur_gt_bnox_indice = 0
496
+ cur_object_features = None
497
+ for slice in token_slices:
498
+ slice_type = slice["type"]
499
+ slice_span = slice["span"]
500
+ if slice_type == "text":
501
+ cur_input_ids_noim = cur_input_ids[slice_span[0] : slice_span[1]]
502
+ cur_labels_noim = cur_labels[slice_span[0] : slice_span[1]]
503
+ cur_input_embeds = self.get_model().embed_tokens(cur_input_ids_noim)
504
+ result_input_embeddings.append(cur_input_embeds)
505
+ result_output_labels.append(cur_labels_noim)
506
+ elif slice_type == "image":
507
+ cur_input_embeds = pixel_values[cur_image_idx]
508
+ result_input_embeddings.append(cur_input_embeds)
509
+ result_output_labels.append(
510
+ torch.full(
511
+ (cur_input_embeds.shape[0],),
512
+ IGNORE_INDEX,
513
+ device=cur_labels.device,
514
+ dtype=cur_labels.dtype,
515
+ )
516
+ )
517
+ cur_image_idx += 1
518
+ elif slice_type == "object":
519
+ try:
520
+ result_input_embeddings.append(
521
+ bbox_feats[cur_object_idx][cur_gt_bnox_indice].unsqueeze(0)
522
+ )
523
+ except Exception:
524
+ raise ValueError(
525
+ f"current boxe_feats.shape: {bbox_feats[cur_object_idx].shape}, "
526
+ )
527
+ cur_gt_bnox_indice += 1
528
+ result_output_labels.append(
529
+ torch.full(
530
+ (1,),
531
+ IGNORE_INDEX,
532
+ device=cur_labels.device,
533
+ dtype=cur_labels.dtype,
534
+ )
535
+ )
536
+ cur_object_idx += 1
537
+ result_input_embeddings = torch.cat(result_input_embeddings)
538
+ result_output_labels = torch.cat(result_output_labels)
539
+ assert len(result_output_labels) == len(result_input_embeddings)
540
+ new_input_embeds.append(result_input_embeddings)
541
+ new_labels.append(result_output_labels)
542
+
543
+ # Truncate sequences to max length as image embeddings can make the sequence longer
544
+ tokenizer_model_max_length = getattr(
545
+ self.config, "tokenizer_model_max_length", None
546
+ )
547
+ if tokenizer_model_max_length is not None:
548
+ new_input_embeds = [
549
+ x[:tokenizer_model_max_length] for x in new_input_embeds
550
+ ]
551
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
552
+
553
+ # Combine them
554
+ max_len = max(x.shape[0] for x in new_input_embeds)
555
+ batch_size = len(new_input_embeds)
556
+
557
+ new_input_embeds_padded = []
558
+ new_labels_padded = torch.full(
559
+ (batch_size, max_len),
560
+ IGNORE_INDEX,
561
+ dtype=new_labels[0].dtype,
562
+ device=new_labels[0].device,
563
+ )
564
+ attention_mask = torch.zeros(
565
+ (batch_size, max_len),
566
+ dtype=attention_mask.dtype,
567
+ device=attention_mask.device,
568
+ )
569
+ position_ids = torch.zeros(
570
+ (batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device
571
+ )
572
+
573
+ for i, (cur_new_embed, cur_new_labels) in enumerate(
574
+ zip(new_input_embeds, new_labels)
575
+ ):
576
+ cur_len = cur_new_embed.shape[0]
577
+ new_input_embeds_padded.append(
578
+ torch.cat(
579
+ (
580
+ cur_new_embed,
581
+ torch.zeros(
582
+ (max_len - cur_len, cur_new_embed.shape[1]),
583
+ dtype=cur_new_embed.dtype,
584
+ device=cur_new_embed.device,
585
+ ),
586
+ ),
587
+ dim=0,
588
+ )
589
+ )
590
+ if cur_len > 0:
591
+ new_labels_padded[i, :cur_len] = cur_new_labels
592
+ attention_mask[i, :cur_len] = True
593
+ position_ids[i, :cur_len] = torch.arange(
594
+ 0, cur_len, dtype=position_ids.dtype, device=position_ids.device
595
+ )
596
+
597
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
598
+
599
+ if _labels is None:
600
+ new_labels = None
601
+ else:
602
+ new_labels = new_labels_padded
603
+
604
+ if _attention_mask is None:
605
+ attention_mask = None
606
+ else:
607
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
608
+
609
+ if _position_ids is None:
610
+ position_ids = None
611
+
612
+ return (
613
+ None,
614
+ position_ids,
615
+ attention_mask,
616
+ past_key_values,
617
+ new_input_embeds,
618
+ new_labels,
619
+ )
620
+
621
+ @torch.no_grad()
622
+ def generate(
623
+ self,
624
+ inputs: Optional[torch.Tensor],
625
+ pixel_values: Optional[torch.Tensor],
626
+ pixel_values_aux: Optional[torch.Tensor],
627
+ position_ids: Optional[torch.Tensor] = None,
628
+ attention_mask: Optional[torch.Tensor] = None,
629
+ inputs_embeds: Optional[torch.Tensor] = None,
630
+ **kwargs,
631
+ ) -> Union[GenerateOutput, torch.LongTensor]:
632
+
633
+ if inputs_embeds is None:
634
+ position_ids = kwargs.pop("position_ids", None)
635
+ attention_mask = kwargs.pop("attention_mask", None)
636
+ gt_boxes = kwargs.pop("gt_boxes", None)
637
+ num_gt_boxes_per_image = kwargs.pop("num_gt_boxes_per_image", None)
638
+
639
+ if pixel_values is not None:
640
+ (inputs, position_ids, attention_mask, _, inputs_embeds, _) = (
641
+ self.prepare_inputs_labels_for_multimodal(
642
+ inputs,
643
+ position_ids,
644
+ attention_mask,
645
+ past_key_values=None,
646
+ labels=None,
647
+ pixel_values=pixel_values,
648
+ pixel_values_aux=pixel_values_aux,
649
+ gt_boxes=gt_boxes,
650
+ num_gt_boxes_per_image=num_gt_boxes_per_image,
651
+ )
652
+ )
653
+
654
+ else:
655
+ inputs_embeds = self.get_model().embed_tokens(inputs)
656
+
657
+ return super().generate(
658
+ position_ids=position_ids,
659
+ attention_mask=attention_mask,
660
+ inputs_embeds=inputs_embeds,
661
+ **kwargs,
662
+ )
663
+
664
+
665
+ AutoConfig.register("rexseek_qwen", RexSeekQwenConfig)
666
+ AutoModelForCausalLM.register(RexSeekQwenConfig, RexSeekQwenForCausalLM)
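Reading aid (not part of the uploaded file): the tail of prepare_inputs_labels_for_multimodal above right-pads every sample's embedding sequence to the batch maximum, fills the padded label slots with IGNORE_INDEX, and rebuilds the attention mask and position ids over the valid prefix. The sketch below is a minimal, self-contained illustration of that padding pattern; the helper name pad_multimodal_batch and the toy tensors are ours.

import torch

IGNORE_INDEX = -100

def pad_multimodal_batch(embeds_list, labels_list):
    """Right-pad variable-length (seq_len_i, hidden) embeddings and (seq_len_i,) labels."""
    max_len = max(e.shape[0] for e in embeds_list)
    hidden = embeds_list[0].shape[1]
    batch = len(embeds_list)
    padded_embeds = torch.zeros(batch, max_len, hidden, dtype=embeds_list[0].dtype)
    padded_labels = torch.full((batch, max_len), IGNORE_INDEX, dtype=labels_list[0].dtype)
    attention_mask = torch.zeros(batch, max_len, dtype=torch.bool)
    position_ids = torch.zeros(batch, max_len, dtype=torch.long)
    for i, (emb, lab) in enumerate(zip(embeds_list, labels_list)):
        n = emb.shape[0]
        padded_embeds[i, :n] = emb          # copy the real embeddings, leave zeros as padding
        padded_labels[i, :n] = lab          # padded positions stay IGNORE_INDEX
        attention_mask[i, :n] = True        # attend only to the valid prefix
        position_ids[i, :n] = torch.arange(n)
    return padded_embeds, padded_labels, attention_mask, position_ids

# Toy check: two samples of length 3 and 5 with hidden size 4.
embeds = [torch.randn(3, 4), torch.randn(5, 4)]
labels = [torch.zeros(3, dtype=torch.long), torch.ones(5, dtype=torch.long)]
e, l, m, p = pad_multimodal_batch(embeds, labels)
print(e.shape, l.shape, m.sum(dim=1), p[0])
# torch.Size([2, 5, 4]) torch.Size([2, 5]) tensor([3, 5]) tensor([0, 1, 2, 0, 0])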
preprocessing_rexseek.py ADDED
@@ -0,0 +1,259 @@
1
+ from PIL import Image
2
+
3
+
4
+ import re
5
+ from typing import List, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torchvision.transforms.functional as F
10
+ from transformers import AutoTokenizer
11
+
12
+ from transformers.processing_utils import ProcessorMixin
13
+
14
+ from transformers.utils import logging
15
+
16
+ logger = logging.get_logger(__name__)
17
+
18
+
19
+ IGNORE_INDEX = -100
20
+ DEFAULT_PAD_TOKEN_INDEX = 0
21
+ IMAGE_TOKEN_INDEX = -200
22
+ DEFAULT_IMAGE_TOKEN = "<image>"
23
+
24
+ # For Objects
25
+ DEFAULT_OBJECT_TOKEN = "<obj<i>>"
26
+ DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
27
+ DEFAULT_OBJECT_INDEX = -300
28
+
29
+ # For Grounding
30
+ DEFAULT_GROUNDING_START = "<ground>"
31
+ DEFAULT_GROUNDING_END = "</ground>"
32
+ DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
33
+ DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
34
+
35
+
36
+ def xyxy_to_xywh(boxes):
37
+ """
38
+ Convert boxes from xyxy to xywh format.
39
+
40
+ Parameters:
41
+ boxes (numpy.ndarray): An array of shape (N, 4) where N is the number of boxes.
42
+ Each box is represented as [x_min, y_min, x_max, y_max].
43
+
44
+ Returns:
45
+ numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, w, h].
46
+ """
47
+ boxes = np.array(boxes)
48
+ x_min, y_min, x_max, y_max = (
49
+ boxes[:, 0],
50
+ boxes[:, 1],
51
+ boxes[:, 2],
52
+ boxes[:, 3],
53
+ )
54
+ w = x_max - x_min
55
+ h = y_max - y_min
56
+ return np.stack([x_min, y_min, w, h], axis=1)
57
+
58
+
59
+ def xywh_to_xyxy(boxes):
60
+ """
61
+ Convert boxes from xywh to xyxy format.
62
+
63
+ Parameters:
64
+ boxes (numpy.ndarray): An array of shape (N, 4) where N is the number of boxes.
65
+ Each box is represented as [x, y, width, height].
66
+
67
+ Returns:
68
+ numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, x_max, y_max].
69
+ """
70
+ boxes = np.array(boxes)
71
+ x, y, width, height = (
72
+ boxes[:, 0],
73
+ boxes[:, 1],
74
+ boxes[:, 2],
75
+ boxes[:, 3],
76
+ )
77
+ x_max = x + width
78
+ y_max = y + height
79
+ return np.stack([x, y, x_max, y_max], axis=1)
80
+
81
+
82
+ def expand2square(pil_img, background_color):
83
+ width, height = pil_img.size
84
+ if width == height:
85
+ return pil_img
86
+ elif width > height:
87
+ result = Image.new(pil_img.mode, (width, width), background_color)
88
+ result.paste(pil_img, (0, (width - height) // 2))
89
+ return result
90
+ else:
91
+ result = Image.new(pil_img.mode, (height, height), background_color)
92
+ result.paste(pil_img, ((height - width) // 2, 0))
93
+ return result
94
+
95
+
96
+ def pad_boxes(gt_boxes, old_size):
97
+ old_w, old_h = old_size
98
+ gt_boxes = np.array(gt_boxes).astype(np.float32)
99
+ # Calculate the padding added
100
+ if old_w > old_h:
101
+ pad_top = (old_w - old_h) // 2
102
+ pad_bottom = old_w - old_h - pad_top
103
+ pad_left, pad_right = 0, 0
104
+ else:
105
+ pad_left = (old_h - old_w) // 2
106
+ pad_right = old_h - old_w - pad_left
107
+ pad_top, pad_bottom = 0, 0
108
+
109
+ # Adjust the boxes for padding
110
+ gt_boxes[:, 0] += pad_left # x
111
+ gt_boxes[:, 1] += pad_top # y
112
+ return gt_boxes
113
+
114
+
115
+ def resize_boxes(gt_boxes, old_size, new_size):
116
+ old_w, old_h = old_size
117
+ new_h, new_w = new_size
118
+ gt_boxes = np.array(gt_boxes).astype(np.float32)
119
+ # Calculate scale factors
120
+ scale_x = new_w / max(old_w, old_h)
121
+ scale_y = new_h / max(old_w, old_h)
122
+
123
+ # Resize the boxes
124
+ gt_boxes[:, 0] *= scale_x # x
125
+ gt_boxes[:, 1] *= scale_y # y
126
+ gt_boxes[:, 2] *= scale_x # w
127
+ gt_boxes[:, 3] *= scale_y # h
128
+
129
+ return gt_boxes
130
+
131
+
132
+ def split_special_strings(input_string: str, special_strings: list[str] = None):
133
+ """Split the input string into a list of strings, keeping the special strings.
134
+
135
+ Args:
136
+ input_string (str): The input string to split.
+ special_strings (list[str]): The special strings to split on and keep in the output.
137
+
138
+ Example:
139
+
140
+ input_string = "<image>\n<obj0><objfeat><obj1><objfeat>\n I am happy today."
141
+ output = ['<image>', '\n<obj0>', '<objfeat>', '<obj1>', '<objfeat>', '\n I am happy today.']
142
+
143
+ Returns:
144
+ list: A list of strings, with the special strings separated from the rest of the input string.
145
+ """
146
+ # Create a regex pattern to match the special strings
147
+ pattern = "|".join(map(re.escape, special_strings))
148
+
149
+ # Split the input string using the pattern, keeping the special strings in the result
150
+ split_list = re.split(f"({pattern})", input_string)
151
+
152
+ # Remove empty strings from the list
153
+ split_list = [s for s in split_list if s]
154
+
155
+ return split_list
156
+
157
+
158
+ def tokenizer_image_object_token(prompt, tokenizer):
159
+ bos_token_id = tokenizer.bos_token_id
160
+ split_tokens = [DEFAULT_IMAGE_TOKEN, DEFAULT_OBJECT_FEATURE_TOKEN]
161
+ chunks = split_special_strings(prompt, split_tokens)
162
+ input_encode = [bos_token_id] if bos_token_id else []
163
+ for chunk in chunks:
164
+ if chunk == DEFAULT_IMAGE_TOKEN:
165
+ input_encode.append(IMAGE_TOKEN_INDEX)
166
+ elif chunk == DEFAULT_OBJECT_FEATURE_TOKEN:
167
+ input_encode.append(DEFAULT_OBJECT_INDEX)
168
+ else:
169
+ input_encode.extend(tokenizer.encode(chunk, add_special_tokens=False))
170
+ return input_encode
171
+
172
+
173
+ class RexSeekProcessor(ProcessorMixin):
174
+ attributes = ["image_processor", "tokenizer"]
175
+ image_processor_class = "AutoImageProcessor"
176
+ tokenizer_class = "AutoTokenizer"
177
+
178
+ def __init__(self, image_processor=None, tokenizer: AutoTokenizer = None, **kwargs):
179
+ # self.image_processor = image_processor
180
+ # self.tokenizer = tokenizer
181
+ super().__init__(image_processor, tokenizer)
182
+ self._special_tokens = None
183
+ self.template = dict(
184
+ SYSTEM=("<|im_start|>system\n{system}<|im_end|>\n"),
185
+ INSTRUCTION=(
186
+ "<|im_start|>user\n{input}<|im_end|>\n" "<|im_start|>assistant\n"
187
+ ),
188
+ SUFFIX="<|im_end|>",
189
+ SUFFIX_AS_EOS=True,
190
+ SEP="\n",
191
+ STOP_WORDS=["<|im_end|>", "<|endoftext|>"],
192
+ )
193
+
194
+ def process(
195
+ self,
196
+ image: Union[str, Image.Image],
197
+ bbox: List[List[int]],
198
+ question: str,
199
+ ):
200
+ """Prepare input data for inference.
201
+
202
+ Args:
203
+ image (Union[str, Image.Image]): The image to process.
204
+ bbox (List[List[int]]): A list of bounding boxes for the image. Each bounding box should
205
+ be in order of [x_min, y_min, x_max, y_max] (xyxy format).
206
+ question (str): The question to ask about the image.
207
+ """
208
+ data_dict = {}
209
+ # step1 load image
210
+ if isinstance(image, str):
211
+ image = Image.open(image).convert("RGB")
212
+ ori_w, ori_h = F.get_image_size(image)
213
+ image = expand2square(
214
+ image,
215
+ tuple(int(x * 255) for x in self.image_processor.image_mean),
216
+ )
217
+ pad_w, pad_h = F.get_image_size(image)
218
+ image_aux = self.image_processor.preprocess(image, return_tensors="pt")[
219
+ "pixel_values"
220
+ ][0]
221
+ resize_h, resize_w = image_aux.shape[-2:]
222
+ data_dict["pixel_values_aux"] = image_aux.unsqueeze(0)
223
+ image = image_aux.clone()
224
+ image = torch.nn.functional.interpolate(
225
+ image[None],
226
+ size=[336, 336],
227
+ mode="bilinear",
228
+ align_corners=False,
229
+ )[0]
230
+ data_dict["pixel_values"] = image.unsqueeze(0)
231
+
232
+ # step2 load boxes
233
+ bbox = xyxy_to_xywh(bbox)
234
+ bbox = pad_boxes(bbox, (ori_w, ori_h))
235
+ bbox = resize_boxes(bbox, (pad_w, pad_h), (resize_h, resize_w))
236
+ data_dict["gt_boxes"] = torch.tensor(xywh_to_xyxy(bbox)).unsqueeze(0)
237
+
238
+ # step3 prepare question
239
+ total_num_boxes = len(bbox)
240
+ obj_tokens = [
241
+ DEFAULT_OBJECT_TOKEN.replace("<i>", str(i)) for i in range(total_num_boxes)
242
+ ]
243
+ obj_tokens = (
244
+ DEFAULT_OBJECT_FEATURE_TOKEN.join(obj_tokens) + DEFAULT_OBJECT_FEATURE_TOKEN
245
+ )
246
+ question = question.replace(DEFAULT_IMAGE_TOKEN, "")
247
+ question = DEFAULT_IMAGE_TOKEN + "\n" + obj_tokens + "\n" + question
248
+
249
+ inputs = ""
250
+ inputs += self.template["INSTRUCTION"].format(input=question, round=1)
251
+
252
+ # step4 tokenize question
253
+ input_ids = tokenizer_image_object_token(inputs, self.tokenizer)
254
+ data_dict["input_ids"] = torch.tensor(input_ids).unsqueeze(0)
255
+
256
+ return data_dict
257
+
258
+
259
+ RexSeekProcessor.register_for_auto_class()
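For orientation, a hypothetical call to RexSeekProcessor.process looks roughly like the sketch below; it assumes a processor instance has already been loaded (see processor_config.json further down for how AutoProcessor resolves it), and the image path, boxes, and question are placeholders. The shapes in the comments follow from the 768x768 CLIP preprocessor config shipped in this upload and the 336x336 interpolation in process().

from PIL import Image

# `processor` is assumed to be an already-loaded RexSeekProcessor instance.
image = Image.open("demo.jpg").convert("RGB")               # placeholder image
candidate_boxes = [[48, 60, 320, 410], [10, 20, 110, 150]]  # [x_min, y_min, x_max, y_max] per box
data = processor.process(
    image=image,
    bbox=candidate_boxes,
    question="Please detect the person in this image.",     # placeholder question
)

print(sorted(data.keys()))
# ['gt_boxes', 'input_ids', 'pixel_values', 'pixel_values_aux']
print(data["pixel_values"].shape, data["pixel_values_aux"].shape)
# torch.Size([1, 3, 336, 336]) torch.Size([1, 3, 768, 768])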
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "crop_size": {
3
+ "height": 768,
4
+ "width": 768
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "processor_class": "RexSeekProcessor",
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 768
27
+ }
28
+ }
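With this config (shortest edge resized to 768, center crop 768), a square-padded input ends up at 768x768, so the box helpers in preprocessing_rexseek.py scale coordinates by 768 / max(width, height). A small worked example of the box pipeline (ours), reusing those helpers:

from preprocessing_rexseek import xyxy_to_xywh, xywh_to_xyxy, pad_boxes, resize_boxes

ori_w, ori_h = 640, 480                              # example original image size
boxes = xyxy_to_xywh([[100, 100, 200, 200]])         # -> [[100, 100, 100, 100]] (xywh)
boxes = pad_boxes(boxes, (ori_w, ori_h))             # square-pad to 640x640: y shifts by 80
boxes = resize_boxes(boxes, (640, 640), (768, 768))  # scale by 768 / 640 = 1.2
print(xywh_to_xyxy(boxes))                           # [[120. 216. 240. 336.]]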
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_rexseek.RexSeekProcessor"
4
+ },
5
+ "processor_class": "RexSeekProcessor"
6
+ }
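The auto_map entry above is what lets a plain AutoProcessor call resolve to the custom class defined in preprocessing_rexseek.py. A minimal loading sketch (the checkpoint path is a placeholder; trust_remote_code is required because the processor code ships with the repository):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "path/to/this/checkpoint",   # placeholder: local path or Hub id of this upload
    trust_remote_code=True,      # needed so preprocessing_rexseek.RexSeekProcessor is imported
)
print(type(processor).__name__)  # expected: RexSeekProcessor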
special_tokens_map.json ADDED
@@ -0,0 +1,128 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<obj0>",
17
+ "<obj1>",
18
+ "<obj2>",
19
+ "<obj3>",
20
+ "<obj4>",
21
+ "<obj5>",
22
+ "<obj6>",
23
+ "<obj7>",
24
+ "<obj8>",
25
+ "<obj9>",
26
+ "<obj10>",
27
+ "<obj11>",
28
+ "<obj12>",
29
+ "<obj13>",
30
+ "<obj14>",
31
+ "<obj15>",
32
+ "<obj16>",
33
+ "<obj17>",
34
+ "<obj18>",
35
+ "<obj19>",
36
+ "<obj20>",
37
+ "<obj21>",
38
+ "<obj22>",
39
+ "<obj23>",
40
+ "<obj24>",
41
+ "<obj25>",
42
+ "<obj26>",
43
+ "<obj27>",
44
+ "<obj28>",
45
+ "<obj29>",
46
+ "<obj30>",
47
+ "<obj31>",
48
+ "<obj32>",
49
+ "<obj33>",
50
+ "<obj34>",
51
+ "<obj35>",
52
+ "<obj36>",
53
+ "<obj37>",
54
+ "<obj38>",
55
+ "<obj39>",
56
+ "<obj40>",
57
+ "<obj41>",
58
+ "<obj42>",
59
+ "<obj43>",
60
+ "<obj44>",
61
+ "<obj45>",
62
+ "<obj46>",
63
+ "<obj47>",
64
+ "<obj48>",
65
+ "<obj49>",
66
+ "<obj50>",
67
+ "<obj51>",
68
+ "<obj52>",
69
+ "<obj53>",
70
+ "<obj54>",
71
+ "<obj55>",
72
+ "<obj56>",
73
+ "<obj57>",
74
+ "<obj58>",
75
+ "<obj59>",
76
+ "<obj60>",
77
+ "<obj61>",
78
+ "<obj62>",
79
+ "<obj63>",
80
+ "<obj64>",
81
+ "<obj65>",
82
+ "<obj66>",
83
+ "<obj67>",
84
+ "<obj68>",
85
+ "<obj69>",
86
+ "<obj70>",
87
+ "<obj71>",
88
+ "<obj72>",
89
+ "<obj73>",
90
+ "<obj74>",
91
+ "<obj75>",
92
+ "<obj76>",
93
+ "<obj77>",
94
+ "<obj78>",
95
+ "<obj79>",
96
+ "<obj80>",
97
+ "<obj81>",
98
+ "<obj82>",
99
+ "<obj83>",
100
+ "<obj84>",
101
+ "<obj85>",
102
+ "<obj86>",
103
+ "<obj87>",
104
+ "<obj88>",
105
+ "<obj89>",
106
+ "<obj90>",
107
+ "<obj91>",
108
+ "<obj92>",
109
+ "<obj93>",
110
+ "<obj94>",
111
+ "<obj95>",
112
+ "<obj96>",
113
+ "<obj97>",
114
+ "<obj98>",
115
+ "<obj99>",
116
+ "<ground>",
117
+ "</ground>",
118
+ "<objects>",
119
+ "</objects>"
120
+ ],
121
+ "eos_token": {
122
+ "content": "<|im_end|>",
123
+ "lstrip": false,
124
+ "normalized": false,
125
+ "rstrip": false,
126
+ "single_word": false
127
+ }
128
+ }
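Because the object and grounding markers are registered as additional special tokens, the tokenizer keeps each of them as a single token instead of splitting them into sub-words; the ids they map to are listed under added_tokens_decoder in tokenizer_config.json below. A quick check (ours; the checkpoint path is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/checkpoint")  # placeholder path
print(tokenizer.convert_tokens_to_ids("<obj0>"))      # 151665
print(tokenizer.convert_tokens_to_ids("<ground>"))    # 151765
print(tokenizer.tokenize("<ground><obj0></ground>"))  # ['<ground>', '<obj0>', '</ground>']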
tokenizer_config.json ADDED
@@ -0,0 +1,1145 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<obj0>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<obj1>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<obj2>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<obj3>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<obj4>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<obj5>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<obj6>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<obj7>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<obj8>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<obj9>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<obj10>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "151676": {
270
+ "content": "<obj11>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<obj12>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<obj13>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<obj14>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<obj15>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<obj16>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<obj17>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<obj18>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<obj19>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<obj20>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<obj21>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<obj22>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<obj23>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "151689": {
374
+ "content": "<obj24>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "151690": {
382
+ "content": "<obj25>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "151691": {
390
+ "content": "<obj26>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "151692": {
398
+ "content": "<obj27>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "151693": {
406
+ "content": "<obj28>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "151694": {
414
+ "content": "<obj29>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "151695": {
422
+ "content": "<obj30>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "151696": {
430
+ "content": "<obj31>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "151697": {
438
+ "content": "<obj32>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "151698": {
446
+ "content": "<obj33>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "151699": {
454
+ "content": "<obj34>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "151700": {
462
+ "content": "<obj35>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "151701": {
470
+ "content": "<obj36>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "151702": {
478
+ "content": "<obj37>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "151703": {
486
+ "content": "<obj38>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "151704": {
494
+ "content": "<obj39>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "151705": {
502
+ "content": "<obj40>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "151706": {
510
+ "content": "<obj41>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "151707": {
518
+ "content": "<obj42>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "151708": {
526
+ "content": "<obj43>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "151709": {
534
+ "content": "<obj44>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "151710": {
542
+ "content": "<obj45>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "151711": {
550
+ "content": "<obj46>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "151712": {
558
+ "content": "<obj47>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "151713": {
566
+ "content": "<obj48>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "151714": {
574
+ "content": "<obj49>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "151715": {
582
+ "content": "<obj50>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "151716": {
590
+ "content": "<obj51>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "151717": {
598
+ "content": "<obj52>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "151718": {
606
+ "content": "<obj53>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "151719": {
614
+ "content": "<obj54>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "151720": {
622
+ "content": "<obj55>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "151721": {
630
+ "content": "<obj56>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "151722": {
638
+ "content": "<obj57>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "151723": {
646
+ "content": "<obj58>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "151724": {
654
+ "content": "<obj59>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "151725": {
662
+ "content": "<obj60>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "151726": {
670
+ "content": "<obj61>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "151727": {
678
+ "content": "<obj62>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "151728": {
686
+ "content": "<obj63>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "151729": {
694
+ "content": "<obj64>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "151730": {
702
+ "content": "<obj65>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "151731": {
710
+ "content": "<obj66>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "151732": {
718
+ "content": "<obj67>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "151733": {
726
+ "content": "<obj68>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "151734": {
734
+ "content": "<obj69>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "151735": {
742
+ "content": "<obj70>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "151736": {
750
+ "content": "<obj71>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "151737": {
758
+ "content": "<obj72>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "151738": {
766
+ "content": "<obj73>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "151739": {
774
+ "content": "<obj74>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ },
781
+ "151740": {
782
+ "content": "<obj75>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": true
788
+ },
789
+ "151741": {
790
+ "content": "<obj76>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": true
796
+ },
797
+ "151742": {
798
+ "content": "<obj77>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": true
804
+ },
805
+ "151743": {
806
+ "content": "<obj78>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": true
812
+ },
813
+ "151744": {
814
+ "content": "<obj79>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": true
820
+ },
821
+ "151745": {
822
+ "content": "<obj80>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": true
828
+ },
829
+ "151746": {
830
+ "content": "<obj81>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": true
836
+ },
837
+ "151747": {
838
+ "content": "<obj82>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": true
844
+ },
845
+ "151748": {
846
+ "content": "<obj83>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": true
852
+ },
853
+ "151749": {
854
+ "content": "<obj84>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "151750": {
862
+ "content": "<obj85>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "151751": {
870
+ "content": "<obj86>",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": true
876
+ },
877
+ "151752": {
878
+ "content": "<obj87>",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": true
884
+ },
885
+ "151753": {
886
+ "content": "<obj88>",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": true
892
+ },
893
+ "151754": {
894
+ "content": "<obj89>",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": true
900
+ },
901
+ "151755": {
902
+ "content": "<obj90>",
903
+ "lstrip": false,
904
+ "normalized": false,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": true
908
+ },
909
+ "151756": {
910
+ "content": "<obj91>",
911
+ "lstrip": false,
912
+ "normalized": false,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": true
916
+ },
917
+ "151757": {
918
+ "content": "<obj92>",
919
+ "lstrip": false,
920
+ "normalized": false,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": true
924
+ },
925
+ "151758": {
926
+ "content": "<obj93>",
927
+ "lstrip": false,
928
+ "normalized": false,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": true
932
+ },
933
+ "151759": {
934
+ "content": "<obj94>",
935
+ "lstrip": false,
936
+ "normalized": false,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": true
940
+ },
941
+ "151760": {
942
+ "content": "<obj95>",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": true
948
+ },
949
+ "151761": {
950
+ "content": "<obj96>",
951
+ "lstrip": false,
952
+ "normalized": false,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": true
956
+ },
957
+ "151762": {
958
+ "content": "<obj97>",
959
+ "lstrip": false,
960
+ "normalized": false,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": true
964
+ },
965
+ "151763": {
966
+ "content": "<obj98>",
967
+ "lstrip": false,
968
+ "normalized": false,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": true
972
+ },
973
+ "151764": {
974
+ "content": "<obj99>",
975
+ "lstrip": false,
976
+ "normalized": false,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": true
980
+ },
981
+ "151765": {
982
+ "content": "<ground>",
983
+ "lstrip": false,
984
+ "normalized": false,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": true
988
+ },
989
+ "151766": {
990
+ "content": "</ground>",
991
+ "lstrip": false,
992
+ "normalized": false,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": true
996
+ },
997
+ "151767": {
998
+ "content": "<objects>",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": true
1004
+ },
1005
+ "151768": {
1006
+ "content": "</objects>",
1007
+ "lstrip": false,
1008
+ "normalized": false,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": true
1012
+ }
1013
+ },
1014
+ "additional_special_tokens": [
1015
+ "<|im_start|>",
1016
+ "<|im_end|>",
1017
+ "<|object_ref_start|>",
1018
+ "<|object_ref_end|>",
1019
+ "<|box_start|>",
1020
+ "<|box_end|>",
1021
+ "<|quad_start|>",
1022
+ "<|quad_end|>",
1023
+ "<|vision_start|>",
1024
+ "<|vision_end|>",
1025
+ "<|vision_pad|>",
1026
+ "<|image_pad|>",
1027
+ "<|video_pad|>",
1028
+ "<obj0>",
1029
+ "<obj1>",
1030
+ "<obj2>",
1031
+ "<obj3>",
1032
+ "<obj4>",
1033
+ "<obj5>",
1034
+ "<obj6>",
1035
+ "<obj7>",
1036
+ "<obj8>",
1037
+ "<obj9>",
1038
+ "<obj10>",
1039
+ "<obj11>",
1040
+ "<obj12>",
1041
+ "<obj13>",
1042
+ "<obj14>",
1043
+ "<obj15>",
1044
+ "<obj16>",
1045
+ "<obj17>",
1046
+ "<obj18>",
1047
+ "<obj19>",
1048
+ "<obj20>",
1049
+ "<obj21>",
1050
+ "<obj22>",
1051
+ "<obj23>",
1052
+ "<obj24>",
1053
+ "<obj25>",
1054
+ "<obj26>",
1055
+ "<obj27>",
1056
+ "<obj28>",
1057
+ "<obj29>",
1058
+ "<obj30>",
1059
+ "<obj31>",
1060
+ "<obj32>",
1061
+ "<obj33>",
1062
+ "<obj34>",
1063
+ "<obj35>",
1064
+ "<obj36>",
1065
+ "<obj37>",
1066
+ "<obj38>",
1067
+ "<obj39>",
1068
+ "<obj40>",
1069
+ "<obj41>",
1070
+ "<obj42>",
1071
+ "<obj43>",
1072
+ "<obj44>",
1073
+ "<obj45>",
1074
+ "<obj46>",
1075
+ "<obj47>",
1076
+ "<obj48>",
1077
+ "<obj49>",
1078
+ "<obj50>",
1079
+ "<obj51>",
1080
+ "<obj52>",
1081
+ "<obj53>",
1082
+ "<obj54>",
1083
+ "<obj55>",
1084
+ "<obj56>",
1085
+ "<obj57>",
1086
+ "<obj58>",
1087
+ "<obj59>",
1088
+ "<obj60>",
1089
+ "<obj61>",
1090
+ "<obj62>",
1091
+ "<obj63>",
1092
+ "<obj64>",
1093
+ "<obj65>",
1094
+ "<obj66>",
1095
+ "<obj67>",
1096
+ "<obj68>",
1097
+ "<obj69>",
1098
+ "<obj70>",
1099
+ "<obj71>",
1100
+ "<obj72>",
1101
+ "<obj73>",
1102
+ "<obj74>",
1103
+ "<obj75>",
1104
+ "<obj76>",
1105
+ "<obj77>",
1106
+ "<obj78>",
1107
+ "<obj79>",
1108
+ "<obj80>",
1109
+ "<obj81>",
1110
+ "<obj82>",
1111
+ "<obj83>",
1112
+ "<obj84>",
1113
+ "<obj85>",
1114
+ "<obj86>",
1115
+ "<obj87>",
1116
+ "<obj88>",
1117
+ "<obj89>",
1118
+ "<obj90>",
1119
+ "<obj91>",
1120
+ "<obj92>",
1121
+ "<obj93>",
1122
+ "<obj94>",
1123
+ "<obj95>",
1124
+ "<obj96>",
1125
+ "<obj97>",
1126
+ "<obj98>",
1127
+ "<obj99>",
1128
+ "<ground>",
1129
+ "</ground>",
1130
+ "<objects>",
1131
+ "</objects>"
1132
+ ],
1133
+ "bos_token": null,
1134
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
1135
+ "clean_up_tokenization_spaces": false,
1136
+ "eos_token": "<|im_end|>",
1137
+ "errors": "replace",
1138
+ "extra_special_tokens": {},
1139
+ "model_max_length": 2048,
1140
+ "pad_token": null,
1141
+ "padding_side": "right",
1142
+ "split_special_tokens": false,
1143
+ "tokenizer_class": "Qwen2Tokenizer",
1144
+ "unk_token": null
1145
+ }
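The Qwen2 chat_template stored above renders the same ChatML framing that RexSeekProcessor assembles by hand (the processor's own template simply omits the system turn), with <|im_end|> serving as both EOS and stop word. A short sketch of what the template produces (ours; the checkpoint path is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/checkpoint")  # placeholder path
messages = [{"role": "user", "content": "<image>\n<obj0><objfeat>\nWhat is <obj0>?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>
# <obj0><objfeat>
# What is <obj0>?<|im_end|>
# <|im_start|>assistant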
vocab.json ADDED
The diff for this file is too large to render. See raw diff