Mountchicken committed
Commit 692ce93 · verified · 1 Parent(s): 57db6e5

Upload 16 files

added_tokens.json ADDED
@@ -0,0 +1,128 @@
1
+ {
2
+ "</ground>": 151766,
3
+ "</objects>": 151768,
4
+ "</tool_call>": 151658,
5
+ "<ground>": 151765,
6
+ "<obj0>": 151665,
7
+ "<obj10>": 151675,
8
+ "<obj11>": 151676,
9
+ "<obj12>": 151677,
10
+ "<obj13>": 151678,
11
+ "<obj14>": 151679,
12
+ "<obj15>": 151680,
13
+ "<obj16>": 151681,
14
+ "<obj17>": 151682,
15
+ "<obj18>": 151683,
16
+ "<obj19>": 151684,
17
+ "<obj1>": 151666,
18
+ "<obj20>": 151685,
19
+ "<obj21>": 151686,
20
+ "<obj22>": 151687,
21
+ "<obj23>": 151688,
22
+ "<obj24>": 151689,
23
+ "<obj25>": 151690,
24
+ "<obj26>": 151691,
25
+ "<obj27>": 151692,
26
+ "<obj28>": 151693,
27
+ "<obj29>": 151694,
28
+ "<obj2>": 151667,
29
+ "<obj30>": 151695,
30
+ "<obj31>": 151696,
31
+ "<obj32>": 151697,
32
+ "<obj33>": 151698,
33
+ "<obj34>": 151699,
34
+ "<obj35>": 151700,
35
+ "<obj36>": 151701,
36
+ "<obj37>": 151702,
37
+ "<obj38>": 151703,
38
+ "<obj39>": 151704,
39
+ "<obj3>": 151668,
40
+ "<obj40>": 151705,
41
+ "<obj41>": 151706,
42
+ "<obj42>": 151707,
43
+ "<obj43>": 151708,
44
+ "<obj44>": 151709,
45
+ "<obj45>": 151710,
46
+ "<obj46>": 151711,
47
+ "<obj47>": 151712,
48
+ "<obj48>": 151713,
49
+ "<obj49>": 151714,
50
+ "<obj4>": 151669,
51
+ "<obj50>": 151715,
52
+ "<obj51>": 151716,
53
+ "<obj52>": 151717,
54
+ "<obj53>": 151718,
55
+ "<obj54>": 151719,
56
+ "<obj55>": 151720,
57
+ "<obj56>": 151721,
58
+ "<obj57>": 151722,
59
+ "<obj58>": 151723,
60
+ "<obj59>": 151724,
61
+ "<obj5>": 151670,
62
+ "<obj60>": 151725,
63
+ "<obj61>": 151726,
64
+ "<obj62>": 151727,
65
+ "<obj63>": 151728,
66
+ "<obj64>": 151729,
67
+ "<obj65>": 151730,
68
+ "<obj66>": 151731,
69
+ "<obj67>": 151732,
70
+ "<obj68>": 151733,
71
+ "<obj69>": 151734,
72
+ "<obj6>": 151671,
73
+ "<obj70>": 151735,
74
+ "<obj71>": 151736,
75
+ "<obj72>": 151737,
76
+ "<obj73>": 151738,
77
+ "<obj74>": 151739,
78
+ "<obj75>": 151740,
79
+ "<obj76>": 151741,
80
+ "<obj77>": 151742,
81
+ "<obj78>": 151743,
82
+ "<obj79>": 151744,
83
+ "<obj7>": 151672,
84
+ "<obj80>": 151745,
85
+ "<obj81>": 151746,
86
+ "<obj82>": 151747,
87
+ "<obj83>": 151748,
88
+ "<obj84>": 151749,
89
+ "<obj85>": 151750,
90
+ "<obj86>": 151751,
91
+ "<obj87>": 151752,
92
+ "<obj88>": 151753,
93
+ "<obj89>": 151754,
94
+ "<obj8>": 151673,
95
+ "<obj90>": 151755,
96
+ "<obj91>": 151756,
97
+ "<obj92>": 151757,
98
+ "<obj93>": 151758,
99
+ "<obj94>": 151759,
100
+ "<obj95>": 151760,
101
+ "<obj96>": 151761,
102
+ "<obj97>": 151762,
103
+ "<obj98>": 151763,
104
+ "<obj99>": 151764,
105
+ "<obj9>": 151674,
106
+ "<objects>": 151767,
107
+ "<tool_call>": 151657,
108
+ "<|box_end|>": 151649,
109
+ "<|box_start|>": 151648,
110
+ "<|endoftext|>": 151643,
111
+ "<|file_sep|>": 151664,
112
+ "<|fim_middle|>": 151660,
113
+ "<|fim_pad|>": 151662,
114
+ "<|fim_prefix|>": 151659,
115
+ "<|fim_suffix|>": 151661,
116
+ "<|im_end|>": 151645,
117
+ "<|im_start|>": 151644,
118
+ "<|image_pad|>": 151655,
119
+ "<|object_ref_end|>": 151647,
120
+ "<|object_ref_start|>": 151646,
121
+ "<|quad_end|>": 151651,
122
+ "<|quad_start|>": 151650,
123
+ "<|repo_name|>": 151663,
124
+ "<|video_pad|>": 151656,
125
+ "<|vision_end|>": 151653,
126
+ "<|vision_pad|>": 151654,
127
+ "<|vision_start|>": 151652
128
+ }
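
These entries extend the base Qwen2 tokenizer with RexSeek's object slots (<obj0>–<obj99>) and grounding markers (<ground>, </ground>, <objects>, </objects>). A minimal sketch of checking that a tokenizer loaded from this repository resolves them to the ids above; the repo id is a placeholder, not taken from this commit:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the repository this commit belongs to.
tokenizer = AutoTokenizer.from_pretrained("Mountchicken/RexSeek", trust_remote_code=True)

# Ids should match added_tokens.json, e.g. <obj0> -> 151665 and <ground> -> 151765.
print(tokenizer.convert_tokens_to_ids("<obj0>"))
print(tokenizer.convert_tokens_to_ids("<ground>"), tokenizer.convert_tokens_to_ids("</objects>"))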
clip.py ADDED
@@ -0,0 +1,127 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
4
+
5
+
6
+ class CLIPVisionTower(nn.Module):
7
+ def __init__(self, vision_tower, args, freeze_vision_tower=False, delay_load=False):
8
+ super().__init__()
9
+
10
+ self.is_loaded = False
11
+
12
+ self.vision_tower_name = vision_tower
13
+ self.select_layer = args.mm_vision_select_layer
14
+ self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
15
+ self.freeze_vision_tower = freeze_vision_tower
16
+ if not delay_load:
17
+ self.load_model()
18
+ elif getattr(args, "unfreeze_mm_vision_tower", False):
19
+ self.load_model()
20
+ else:
21
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
22
+
23
+ def load_model(self, device_map=None):
24
+ if self.is_loaded:
25
+ print(
26
+ "{} is already loaded, `load_model` called again, skipping.".format(
27
+ self.vision_tower_name
28
+ )
29
+ )
30
+ return
31
+
32
+ self.image_processor = CLIPImageProcessor.from_pretrained(
33
+ self.vision_tower_name
34
+ )
35
+ self.vision_tower = CLIPVisionModel.from_pretrained(
36
+ self.vision_tower_name, device_map=device_map
37
+ )
38
+
39
+ if self.freeze_vision_tower:
40
+ self.vision_tower.requires_grad_(False)
41
+
42
+ self.is_loaded = True
43
+
44
+ def feature_select(self, image_forward_outs):
45
+ image_features = image_forward_outs.hidden_states[self.select_layer]
46
+ if self.select_feature == "patch":
47
+ image_features = image_features[:, 1:]
48
+ elif self.select_feature == "cls_patch":
49
+ image_features = image_features
50
+ else:
51
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
52
+ return image_features
53
+
54
+ def forward(self, images):
55
+ if type(images) is list:
56
+ image_features = []
57
+ for image in images:
58
+ if self.freeze_vision_tower:
59
+ with torch.no_grad():
60
+ image_forward_out = self.vision_tower(
61
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
62
+ output_hidden_states=True,
63
+ )
64
+ image_feature = self.feature_select(image_forward_out).to(
65
+ image.dtype
66
+ )
67
+ image_features.append(image_feature)
68
+ else:
69
+ image_forward_out = self.vision_tower(
70
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
71
+ output_hidden_states=True,
72
+ )
73
+ image_feature = self.feature_select(image_forward_out).to(
74
+ image.dtype
75
+ )
76
+ image_features.append(image_feature)
77
+ else:
78
+ if self.freeze_vision_tower:
79
+ with torch.no_grad():
80
+ image_forward_out = self.vision_tower(
81
+ images.to(device=self.device, dtype=self.dtype),
82
+ output_hidden_states=True,
83
+ )
84
+ image_features = self.feature_select(image_forward_out).to(
85
+ images.dtype
86
+ )
87
+ else:
88
+ image_forward_outs = self.vision_tower(
89
+ images.to(device=self.device, dtype=self.dtype),
90
+ output_hidden_states=True,
91
+ )
92
+ image_features = self.feature_select(image_forward_outs).to(
93
+ images.dtype
94
+ )
95
+
96
+ return image_features
97
+
98
+ @property
99
+ def dummy_feature(self):
100
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
101
+
102
+ @property
103
+ def dtype(self):
104
+ return self.vision_tower.dtype
105
+
106
+ @property
107
+ def device(self):
108
+ return self.vision_tower.device
109
+
110
+ @property
111
+ def config(self):
112
+ if self.is_loaded:
113
+ return self.vision_tower.config
114
+ else:
115
+ return self.cfg_only
116
+
117
+ @property
118
+ def hidden_size(self):
119
+ return self.config.hidden_size
120
+
121
+ @property
122
+ def num_patches_per_side(self):
123
+ return self.config.image_size // self.config.patch_size
124
+
125
+ @property
126
+ def num_patches(self):
127
+ return (self.config.image_size // self.config.patch_size) ** 2
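
A rough standalone usage sketch for CLIPVisionTower, assuming this repository's clip.py is importable from the working directory (not the OpenAI `clip` package) and that the two config attributes it reads are supplied through a simple namespace; values mirror config.json in this commit:

import torch
from types import SimpleNamespace

from clip import CLIPVisionTower  # this repository's clip.py

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args, freeze_vision_tower=True)

pixel_values = torch.randn(1, 3, 336, 336)  # already-preprocessed pixel values
with torch.no_grad():
    features = tower(pixel_values)          # penultimate-layer patch tokens: (1, 576, 1024)
print(features.shape)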
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "architectures": [
+     "RexSeekQwenForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "modeling_rexseek.RexSeekQwenConfig",
+     "AutoModelForCausalLM": "modeling_rexseek.RexSeekQwenForCausalLM"
+   },
+   "attention_dropout": 0.0,
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "freeze_mm_mlp_adapter": false,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "image_aspect_ratio": "pad",
+   "initializer_range": 0.02,
+   "intermediate_size": 11008,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 70,
+   "mm_hidden_size": 2560,
+   "mm_patch_merge_type": "flat",
+   "mm_projector_lr": null,
+   "mm_projector_type": "mlp2x_gelu",
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -2,
+   "mm_vision_tower": "openai/clip-vit-large-patch14-336",
+   "model_type": "rexseek_qwen",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 36,
+   "num_key_value_heads": 2,
+   "object_hidden_size": 2880,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "tokenizer_model_max_length": 2048,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.48.0",
+   "use_cache": true,
+   "use_mm_proj": true,
+   "use_sliding_window": false,
+   "vis_during_training_prob": 0.0,
+   "vocab_size": 151769
+ }
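
Because of the auto_map entries, the configuration resolves to the custom RexSeekQwenConfig through the Auto classes when trust_remote_code=True. A brief sketch, with the repo id left as a placeholder:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("Mountchicken/RexSeek", trust_remote_code=True)  # placeholder repo id
print(cfg.model_type)       # "rexseek_qwen"
print(cfg.mm_vision_tower)  # "openai/clip-vit-large-patch14-336"
print(cfg.vocab_size)       # 151769, matching the added tokens above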
convnext.py ADDED
@@ -0,0 +1,697 @@
1
+ from functools import partial
2
+ from typing import Callable, List, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from open_clip.factory import get_model_config
7
+ from open_clip.model import CLIPVisionCfg
8
+ from timm.layers import (
9
+ AvgPool2dSame,
10
+ ClassifierHead,
11
+ DropPath,
12
+ GlobalResponseNormMlp,
13
+ LayerNorm,
14
+ LayerNorm2d,
15
+ Mlp,
16
+ NormMlpClassifierHead,
17
+ create_conv2d,
18
+ get_act_layer,
19
+ make_divisible,
20
+ to_ntuple,
21
+ trunc_normal_,
22
+ )
23
+ from timm.models._builder import build_model_with_cfg
24
+ from timm.models._features import feature_take_indices
25
+ from timm.models._manipulate import checkpoint_seq, named_apply
26
+
27
+ __all__ = ["ConvNeXt"] # model_registry will add each entrypoint fn to this
28
+
29
+
30
+ class Downsample(nn.Module):
31
+
32
+ def __init__(self, in_chs, out_chs, stride=1, dilation=1):
33
+ super().__init__()
34
+ avg_stride = stride if dilation == 1 else 1
35
+ if stride > 1 or dilation > 1:
36
+ avg_pool_fn = (
37
+ AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
38
+ )
39
+ self.pool = avg_pool_fn(
40
+ 2, avg_stride, ceil_mode=True, count_include_pad=False
41
+ )
42
+ else:
43
+ self.pool = nn.Identity()
44
+
45
+ if in_chs != out_chs:
46
+ self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
47
+ else:
48
+ self.conv = nn.Identity()
49
+
50
+ def forward(self, x):
51
+ x = self.pool(x)
52
+ x = self.conv(x)
53
+ return x
54
+
55
+
56
+ class ConvNeXtBlock(nn.Module):
57
+ """ConvNeXt Block
58
+ There are two equivalent implementations:
59
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
60
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
61
+
62
+ Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
63
+ choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
64
+ is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
65
+ """
66
+
67
+ def __init__(
68
+ self,
69
+ in_chs: int,
70
+ out_chs: Optional[int] = None,
71
+ kernel_size: int = 7,
72
+ stride: int = 1,
73
+ dilation: Union[int, Tuple[int, int]] = (1, 1),
74
+ mlp_ratio: float = 4,
75
+ conv_mlp: bool = False,
76
+ conv_bias: bool = True,
77
+ use_grn: bool = False,
78
+ ls_init_value: Optional[float] = 1e-6,
79
+ act_layer: Union[str, Callable] = "gelu",
80
+ norm_layer: Optional[Callable] = None,
81
+ drop_path: float = 0.0,
82
+ ):
83
+ """
84
+
85
+ Args:
86
+ in_chs: Block input channels.
87
+ out_chs: Block output channels (same as in_chs if None).
88
+ kernel_size: Depthwise convolution kernel size.
89
+ stride: Stride of depthwise convolution.
90
+ dilation: Tuple specifying input and output dilation of block.
91
+ mlp_ratio: MLP expansion ratio.
92
+ conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
93
+ conv_bias: Apply bias for all convolution (linear) layers.
94
+ use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
95
+ ls_init_value: Layer-scale init values, layer-scale applied if not None.
96
+ act_layer: Activation layer.
97
+ norm_layer: Normalization layer (defaults to LN if not specified).
98
+ drop_path: Stochastic depth probability.
99
+ """
100
+ super().__init__()
101
+ out_chs = out_chs or in_chs
102
+ dilation = to_ntuple(2)(dilation)
103
+ act_layer = get_act_layer(act_layer)
104
+ if not norm_layer:
105
+ norm_layer = LayerNorm2d if conv_mlp else LayerNorm
106
+ mlp_layer = partial(
107
+ GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp
108
+ )
109
+ self.use_conv_mlp = conv_mlp
110
+ self.conv_dw = create_conv2d(
111
+ in_chs,
112
+ out_chs,
113
+ kernel_size=kernel_size,
114
+ stride=stride,
115
+ dilation=dilation[0],
116
+ depthwise=True,
117
+ bias=conv_bias,
118
+ )
119
+ self.norm = norm_layer(out_chs)
120
+ self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
121
+ self.ramma = (
122
+ nn.Parameter(ls_init_value * torch.ones(out_chs))
123
+ if ls_init_value is not None
124
+ else None
125
+ )
126
+ if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
127
+ self.shortcut = Downsample(
128
+ in_chs, out_chs, stride=stride, dilation=dilation[0]
129
+ )
130
+ else:
131
+ self.shortcut = nn.Identity()
132
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
133
+
134
+ def forward(self, x):
135
+ shortcut = x
136
+ x = self.conv_dw(x)
137
+ if self.use_conv_mlp:
138
+ x = self.norm(x)
139
+ x = self.mlp(x)
140
+ else:
141
+ x = x.permute(0, 2, 3, 1)
142
+ x = self.norm(x)
143
+ x = self.mlp(x)
144
+ x = x.permute(0, 3, 1, 2)
145
+ if self.ramma is not None:
146
+ x = x.mul(self.ramma.reshape(1, -1, 1, 1))
147
+
148
+ x = self.drop_path(x) + self.shortcut(shortcut)
149
+ return x
150
+
151
+
152
+ class ConvNeXtStage(nn.Module):
153
+
154
+ def __init__(
155
+ self,
156
+ in_chs,
157
+ out_chs,
158
+ kernel_size=7,
159
+ stride=2,
160
+ depth=2,
161
+ dilation=(1, 1),
162
+ drop_path_rates=None,
163
+ ls_init_value=1.0,
164
+ conv_mlp=False,
165
+ conv_bias=True,
166
+ use_grn=False,
167
+ act_layer="gelu",
168
+ norm_layer=None,
169
+ norm_layer_cl=None,
170
+ ):
171
+ super().__init__()
172
+ self.grad_checkpointing = False
173
+
174
+ if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
175
+ ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
176
+ pad = (
177
+ "same" if dilation[1] > 1 else 0
178
+ ) # same padding needed if dilation used
179
+ self.downsample = nn.Sequential(
180
+ norm_layer(in_chs),
181
+ create_conv2d(
182
+ in_chs,
183
+ out_chs,
184
+ kernel_size=ds_ks,
185
+ stride=stride,
186
+ dilation=dilation[0],
187
+ padding=pad,
188
+ bias=conv_bias,
189
+ ),
190
+ )
191
+ in_chs = out_chs
192
+ else:
193
+ self.downsample = nn.Identity()
194
+
195
+ drop_path_rates = drop_path_rates or [0.0] * depth
196
+ stage_blocks = []
197
+ for i in range(depth):
198
+ stage_blocks.append(
199
+ ConvNeXtBlock(
200
+ in_chs=in_chs,
201
+ out_chs=out_chs,
202
+ kernel_size=kernel_size,
203
+ dilation=dilation[1],
204
+ drop_path=drop_path_rates[i],
205
+ ls_init_value=ls_init_value,
206
+ conv_mlp=conv_mlp,
207
+ conv_bias=conv_bias,
208
+ use_grn=use_grn,
209
+ act_layer=act_layer,
210
+ norm_layer=norm_layer if conv_mlp else norm_layer_cl,
211
+ )
212
+ )
213
+ in_chs = out_chs
214
+ self.blocks = nn.Sequential(*stage_blocks)
215
+
216
+ def forward(self, x):
217
+ x = self.downsample(x)
218
+ if self.grad_checkpointing and not torch.jit.is_scripting():
219
+ x = checkpoint_seq(self.blocks, x)
220
+ else:
221
+ x = self.blocks(x)
222
+ return x
223
+
224
+
225
+ class ConvNeXt(nn.Module):
226
+ r"""ConvNeXt
227
+ A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
228
+ """
229
+
230
+ def __init__(
231
+ self,
232
+ in_chans: int = 3,
233
+ num_classes: int = 1000,
234
+ global_pool: str = "avg",
235
+ output_stride: int = 32,
236
+ depths: Tuple[int, ...] = (3, 3, 9, 3),
237
+ dims: Tuple[int, ...] = (96, 192, 384, 768),
238
+ kernel_sizes: Union[int, Tuple[int, ...]] = 7,
239
+ ls_init_value: Optional[float] = 1e-6,
240
+ stem_type: str = "patch",
241
+ patch_size: int = 4,
242
+ head_init_scale: float = 1.0,
243
+ head_norm_first: bool = False,
244
+ head_hidden_size: Optional[int] = None,
245
+ conv_mlp: bool = False,
246
+ conv_bias: bool = True,
247
+ use_grn: bool = False,
248
+ act_layer: Union[str, Callable] = "gelu",
249
+ norm_layer: Optional[Union[str, Callable]] = None,
250
+ norm_eps: Optional[float] = None,
251
+ drop_rate: float = 0.0,
252
+ drop_path_rate: float = 0.0,
253
+ ):
254
+ """
255
+ Args:
256
+ in_chans: Number of input image channels.
257
+ num_classes: Number of classes for classification head.
258
+ global_pool: Global pooling type.
259
+ output_stride: Output stride of network, one of (8, 16, 32).
260
+ depths: Number of blocks at each stage.
261
+ dims: Feature dimension at each stage.
262
+ kernel_sizes: Depthwise convolution kernel-sizes for each stage.
263
+ ls_init_value: Init value for Layer Scale, disabled if None.
264
+ stem_type: Type of stem.
265
+ patch_size: Stem patch size for patch stem.
266
+ head_init_scale: Init scaling value for classifier weights and biases.
267
+ head_norm_first: Apply normalization before global pool + head.
268
+ head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
269
+ conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
270
+ conv_bias: Use bias layers w/ all convolutions.
271
+ use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
272
+ act_layer: Activation layer type.
273
+ norm_layer: Normalization layer type.
274
+ drop_rate: Head pre-classifier dropout rate.
275
+ drop_path_rate: Stochastic depth drop rate.
276
+ """
277
+ super().__init__()
278
+ assert output_stride in (8, 16, 32)
279
+ kernel_sizes = to_ntuple(4)(kernel_sizes)
280
+ if norm_layer is None:
281
+ norm_layer = LayerNorm2d
282
+ norm_layer_cl = norm_layer if conv_mlp else LayerNorm
283
+ if norm_eps is not None:
284
+ norm_layer = partial(norm_layer, eps=norm_eps)
285
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
286
+ else:
287
+ assert (
288
+ conv_mlp
289
+ ), "If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input"
290
+ norm_layer_cl = norm_layer
291
+ if norm_eps is not None:
292
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
293
+
294
+ self.num_classes = num_classes
295
+ self.drop_rate = drop_rate
296
+ self.feature_info = []
297
+
298
+ assert stem_type in ("patch", "overlap", "overlap_tiered")
299
+ if stem_type == "patch":
300
+ # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
301
+ self.stem = nn.Sequential(
302
+ nn.Conv2d(
303
+ in_chans,
304
+ dims[0],
305
+ kernel_size=patch_size,
306
+ stride=patch_size,
307
+ bias=conv_bias,
308
+ ),
309
+ norm_layer(dims[0]),
310
+ )
311
+ stem_stride = patch_size
312
+ else:
313
+ mid_chs = make_divisible(dims[0] // 2) if "tiered" in stem_type else dims[0]
314
+ self.stem = nn.Sequential(
315
+ nn.Conv2d(
316
+ in_chans,
317
+ mid_chs,
318
+ kernel_size=3,
319
+ stride=2,
320
+ padding=1,
321
+ bias=conv_bias,
322
+ ),
323
+ nn.Conv2d(
324
+ mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias
325
+ ),
326
+ norm_layer(dims[0]),
327
+ )
328
+ stem_stride = 4
329
+
330
+ self.stages = nn.Sequential()
331
+ dp_rates = [
332
+ x.tolist()
333
+ for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)
334
+ ]
335
+ stages = []
336
+ prev_chs = dims[0]
337
+ curr_stride = stem_stride
338
+ dilation = 1
339
+ # 4 feature resolution stages, each consisting of multiple residual blocks
340
+ for i in range(4):
341
+ stride = 2 if curr_stride == 2 or i > 0 else 1
342
+ if curr_stride >= output_stride and stride > 1:
343
+ dilation *= stride
344
+ stride = 1
345
+ curr_stride *= stride
346
+ first_dilation = 1 if dilation in (1, 2) else 2
347
+ out_chs = dims[i]
348
+ stages.append(
349
+ ConvNeXtStage(
350
+ prev_chs,
351
+ out_chs,
352
+ kernel_size=kernel_sizes[i],
353
+ stride=stride,
354
+ dilation=(first_dilation, dilation),
355
+ depth=depths[i],
356
+ drop_path_rates=dp_rates[i],
357
+ ls_init_value=ls_init_value,
358
+ conv_mlp=conv_mlp,
359
+ conv_bias=conv_bias,
360
+ use_grn=use_grn,
361
+ act_layer=act_layer,
362
+ norm_layer=norm_layer,
363
+ norm_layer_cl=norm_layer_cl,
364
+ )
365
+ )
366
+ prev_chs = out_chs
367
+ # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
368
+ self.feature_info += [
369
+ dict(num_chs=prev_chs, reduction=curr_stride, module=f"stages.{i}")
370
+ ]
371
+ self.stages = nn.Sequential(*stages)
372
+ self.num_features = self.head_hidden_size = prev_chs
373
+
374
+ # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
375
+ # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
376
+ if head_norm_first:
377
+ assert not head_hidden_size
378
+ self.norm_pre = norm_layer(self.num_features)
379
+ self.head = ClassifierHead(
380
+ self.num_features,
381
+ num_classes,
382
+ pool_type=global_pool,
383
+ drop_rate=self.drop_rate,
384
+ )
385
+ else:
386
+ self.norm_pre = nn.Identity()
387
+ self.head = NormMlpClassifierHead(
388
+ self.num_features,
389
+ num_classes,
390
+ hidden_size=head_hidden_size,
391
+ pool_type=global_pool,
392
+ drop_rate=self.drop_rate,
393
+ norm_layer=norm_layer,
394
+ act_layer="gelu",
395
+ )
396
+ self.head_hidden_size = self.head.num_features
397
+ named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
398
+
399
+ @torch.jit.ignore
400
+ def group_matcher(self, coarse=False):
401
+ return dict(
402
+ stem=r"^stem",
403
+ blocks=(
404
+ r"^stages\.(\d+)"
405
+ if coarse
406
+ else [
407
+ (r"^stages\.(\d+)\.downsample", (0,)), # blocks
408
+ (r"^stages\.(\d+)\.blocks\.(\d+)", None),
409
+ (r"^norm_pre", (99999,)),
410
+ ]
411
+ ),
412
+ )
413
+
414
+ @torch.jit.ignore
415
+ def set_grad_checkpointing(self, enable=True):
416
+ for s in self.stages:
417
+ s.grad_checkpointing = enable
418
+
419
+ @torch.jit.ignore
420
+ def get_classifier(self) -> nn.Module:
421
+ return self.head.fc
422
+
423
+ def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None):
424
+ self.num_classes = num_classes
425
+ self.head.reset(num_classes, global_pool)
426
+
427
+ def forward_intermediates(
428
+ self,
429
+ x: torch.Tensor,
430
+ indices: Optional[Union[int, List[int], Tuple[int]]] = None,
431
+ norm: bool = False,
432
+ stop_early: bool = False,
433
+ output_fmt: str = "NCHW",
434
+ intermediates_only: bool = False,
435
+ ) -> Union[List[torch.Tensor], Tuple[torch.Tensor, List[torch.Tensor]]]:
436
+ """Forward features that returns intermediates.
437
+
438
+ Args:
439
+ x: Input image tensor
440
+ indices: Take last n blocks if int, all if None, select matching indices if sequence
441
+ norm: Apply norm layer to compatible intermediates
442
+ stop_early: Stop iterating over blocks when last desired intermediate hit
443
+ output_fmt: Shape of intermediate feature outputs
444
+ intermediates_only: Only return intermediate features
445
+ Returns:
446
+
447
+ """
448
+ assert output_fmt in ("NCHW",), "Output shape must be NCHW."
449
+ intermediates = []
450
+ take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
451
+
452
+ # forward pass
453
+ feat_idx = 0 # stem is index 0
454
+ x = self.stem(x)
455
+ if feat_idx in take_indices:
456
+ intermediates.append(x)
457
+
458
+ if (
459
+ torch.jit.is_scripting() or not stop_early
460
+ ): # can't slice blocks in torchscript
461
+ stages = self.stages
462
+ else:
463
+ stages = self.stages[:max_index]
464
+ for stage in stages:
465
+ feat_idx += 1
466
+ x = stage(x)
467
+ if feat_idx in take_indices:
468
+ # NOTE not bothering to apply norm_pre when norm=True as almost no models have it enabled
469
+ intermediates.append(x)
470
+
471
+ if intermediates_only:
472
+ return intermediates
473
+
474
+ x = self.norm_pre(x)
475
+
476
+ return x, intermediates
477
+
478
+ def prune_intermediate_layers(
479
+ self,
480
+ indices: Union[int, List[int], Tuple[int]] = 1,
481
+ prune_norm: bool = False,
482
+ prune_head: bool = True,
483
+ ):
484
+ """Prune layers not required for specified intermediates."""
485
+ take_indices, max_index = feature_take_indices(len(self.stages) + 1, indices)
486
+ self.stages = self.stages[:max_index] # truncate blocks w/ stem as idx 0
487
+ if prune_norm:
488
+ self.norm_pre = nn.Identity()
489
+ if prune_head:
490
+ self.reset_classifier(0, "")
491
+ return take_indices
492
+
493
+ def forward_features(self, x):
494
+ x = self.stem(x)
495
+ x = self.stages(x)
496
+ x = self.norm_pre(x)
497
+ return x
498
+
499
+ def forward_head(self, x, pre_logits: bool = False):
500
+ return self.head(x, pre_logits=True) if pre_logits else self.head(x)
501
+
502
+ def forward(self, x):
503
+ x = self.forward_features(x)
504
+ x = self.forward_head(x)
505
+ return x
506
+
507
+
508
+ def _init_weights(module, name=None, head_init_scale=1.0):
509
+ if isinstance(module, nn.Conv2d):
510
+ trunc_normal_(module.weight, std=0.02)
511
+ if module.bias is not None:
512
+ nn.init.zeros_(module.bias)
513
+ elif isinstance(module, nn.Linear):
514
+ trunc_normal_(module.weight, std=0.02)
515
+ nn.init.zeros_(module.bias)
516
+ if name and "head." in name:
517
+ module.weight.data.mul_(head_init_scale)
518
+ module.bias.data.mul_(head_init_scale)
519
+
520
+
521
+ def checkpoint_filter_fn(state_dict, model):
522
+ """Remap FB checkpoints -> timm"""
523
+ if "head.norm.weight" in state_dict or "norm_pre.weight" in state_dict:
524
+ return state_dict # non-FB checkpoint
525
+ if "model" in state_dict:
526
+ state_dict = state_dict["model"]
527
+
528
+ out_dict = {}
529
+ if "visual.trunk.stem.0.weight" in state_dict:
530
+ out_dict = {
531
+ k.replace("visual.trunk.", ""): v
532
+ for k, v in state_dict.items()
533
+ if k.startswith("visual.trunk.")
534
+ }
535
+ if "visual.head.proj.weight" in state_dict:
536
+ out_dict["head.fc.weight"] = state_dict["visual.head.proj.weight"]
537
+ out_dict["head.fc.bias"] = torch.zeros(
538
+ state_dict["visual.head.proj.weight"].shape[0]
539
+ )
540
+ elif "visual.head.mlp.fc1.weight" in state_dict:
541
+ out_dict["head.pre_logits.fc.weight"] = state_dict[
542
+ "visual.head.mlp.fc1.weight"
543
+ ]
544
+ out_dict["head.pre_logits.fc.bias"] = state_dict["visual.head.mlp.fc1.bias"]
545
+ out_dict["head.fc.weight"] = state_dict["visual.head.mlp.fc2.weight"]
546
+ out_dict["head.fc.bias"] = torch.zeros(
547
+ state_dict["visual.head.mlp.fc2.weight"].shape[0]
548
+ )
549
+ return out_dict
550
+
551
+ import re
552
+
553
+ for k, v in state_dict.items():
554
+ k = k.replace("downsample_layers.0.", "stem.")
555
+ k = re.sub(r"stages.([0-9]+).([0-9]+)", r"stages.\1.blocks.\2", k)
556
+ k = re.sub(
557
+ r"downsample_layers.([0-9]+).([0-9]+)", r"stages.\1.downsample.\2", k
558
+ )
559
+ k = k.replace("dwconv", "conv_dw")
560
+ k = k.replace("pwconv", "mlp.fc")
561
+ if "grn" in k:
562
+ k = k.replace("grn.beta", "mlp.grn.bias")
563
+ k = k.replace("grn.ramma", "mlp.grn.weight")
564
+ v = v.reshape(v.shape[-1])
565
+ k = k.replace("head.", "head.fc.")
566
+ if k.startswith("norm."):
567
+ k = k.replace("norm", "head.norm")
568
+ if v.ndim == 2 and "head" not in k:
569
+ model_shape = model.state_dict()[k].shape
570
+ v = v.reshape(model_shape)
571
+ out_dict[k] = v
572
+
573
+ return out_dict
574
+
575
+
576
+ def _create_convnext(variant, pretrained=False, **kwargs):
577
+ if kwargs.get("pretrained_cfg", "") == "fcmae":
578
+ # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`)
579
+ # This is workaround loading with num_classes=0 w/o removing norm-layer.
580
+ kwargs.setdefault("pretrained_strict", False)
581
+
582
+ model = build_model_with_cfg(
583
+ ConvNeXt,
584
+ variant,
585
+ pretrained,
586
+ pretrained_filter_fn=checkpoint_filter_fn,
587
+ feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
588
+ **kwargs,
589
+ )
590
+ return model
591
+
592
+
593
+ def convnext_large(pretrained=False, **kwargs) -> ConvNeXt:
594
+ model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536])
595
+ model = _create_convnext(
596
+ "convnext_large", pretrained=pretrained, **dict(model_args, **kwargs)
597
+ )
598
+ return model
599
+
600
+
601
+ class CLIP(nn.Module):
602
+ output_dict: torch.jit.Final[bool]
603
+
604
+ def __init__(
605
+ self,
606
+ embed_dim: int,
607
+ vision_cfg: CLIPVisionCfg,
608
+ quick_gelu: bool = False,
609
+ cast_dtype: Optional[torch.dtype] = None,
610
+ output_dict: bool = False,
611
+ **kwargs,
612
+ ):
613
+ super().__init__()
614
+ self.output_dict = output_dict
615
+
616
+ self.visual = convnext_large()
617
+
618
+
619
+ class ConvNextVisionEncoder(nn.Module):
620
+ def __init__(
621
+ self,
622
+ ):
623
+ super().__init__()
624
+ self.model_type = "convnext_large_d_320"
625
+ self.model_channel = [192, 384, 768, 1536] # stage 0-3
626
+
627
+ clip_model = CLIP(**get_model_config(self.model_type), use_text=False)
628
+
629
+ # decompose stem and stages blocks in vision tower
630
+ self.vision_stem = clip_model.visual.stem
631
+ self.vision_stages = clip_model.visual.stages
632
+
633
+ def forward(self, images):
634
+
635
+ if type(images) is list:
636
+ image_features = []
637
+ for image in images:
638
+ image_feature = self.backbone(
639
+ image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
640
+ )
641
+ image_features.append(image_feature)
642
+ else:
643
+ image_features = self.backbone(
644
+ images.to(device=self.device, dtype=self.dtype),
645
+ )
646
+
647
+ return {
648
+ "image_features": image_features,
649
+ "last_feat": image_features[-1],
650
+ }
651
+
652
+ def backbone(self, images: torch.Tensor) -> List[torch.Tensor]:
653
+ """Process the input images through the backbone network.
654
+
655
+ Inputs:
656
+ images (torch.Tensor): The input images.
657
+
658
+ Returns:
659
+ List[torch.Tensor]: A list of per-stage feature maps; the per-level channel
+ counts are listed in self.model_channel.
661
+ """
662
+ with torch.no_grad():
663
+ results = self.basic_forward(images)
664
+ feature_maps = []
665
+
666
+ for _stage in results:
667
+ feature_maps.append(results[_stage].contiguous())
668
+ return feature_maps
669
+
670
+ def basic_forward(self, images):
671
+ results = {}
672
+ x = self.vision_stem(images)
673
+ for _idx in range(len(self.vision_stages)):
674
+ x = self.vision_stages[_idx](x)
675
+ results[f"stage_{_idx}"] = x
676
+ return results
677
+
678
+ @property
679
+ def dtype(self):
680
+ return self.vision_stem[0].weight.dtype
681
+
682
+ @property
683
+ def device(self):
684
+ return self.vision_stem[0].weight.device
685
+
686
+ @property
687
+ def config(self):
688
+ return self.vision_config
689
+
690
+ @property
691
+ def hidden_size(self):
692
+ return sum(self.model_channel)
693
+
694
+
695
+ if __name__ == "__main__":
696
+ model = ConvNextVisionEncoder()
697
+ print(model.state_dict().keys())
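
A hedged usage sketch for this auxiliary high-resolution encoder, assuming timm and open_clip are installed and that inputs follow the 768x768 resolution implied by the spatail_scale = 192 / 768 setting in modeling_rexseek.py later in this commit:

import torch

from convnext import ConvNextVisionEncoder  # this repository's convnext.py

encoder = ConvNextVisionEncoder().eval()
images = torch.randn(1, 3, 768, 768)  # assumed input resolution
out = encoder(images)
for name, feat in zip(["stage_0", "stage_1", "stage_2", "stage_3"], out["image_features"]):
    print(name, tuple(feat.shape))    # channel counts follow [192, 384, 768, 1536]
print("last_feat", tuple(out["last_feat"].shape))  # (1, 1536, 24, 24) for a 768x768 input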
generation_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "attn_implementation": "flash_attention_2",
+   "bos_token_id": 151643,
+   "delay_load": false,
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "repetition_penalty": 1.05,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "4.48.0"
+ }
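
These sampling defaults are picked up by generate() automatically once the model is loaded from the repository; they can also be inspected or overridden explicitly. A small sketch with a placeholder repo id:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("Mountchicken/RexSeek")  # placeholder repo id
print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k, gen_cfg.repetition_penalty)
# Passing generation_config=gen_cfg (or the individual kwargs) to model.generate() applies the same settings.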
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9eaba82827e894601eb5fe8338dd1c9b146ab749ab07287950b9069823743d1
3
+ size 4956876272
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab2bf0f72bd76bb22ec2e430f012067f16faeef970b43ce64abefc3777fcb1b4
3
+ size 2874661528
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_rexseek.py ADDED
@@ -0,0 +1,666 @@
1
+ import logging
2
+ import math
3
+ import os
4
+ import re
5
+ from typing import List, Optional, Union
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch import nn
11
+ from torchvision.ops import roi_align
12
+ from transformers import (
13
+ AutoConfig,
14
+ AutoModel,
15
+ AutoModelForCausalLM,
16
+ Qwen2Config,
17
+ Qwen2ForCausalLM,
18
+ StoppingCriteria,
19
+ StoppingCriteriaList,
20
+ )
21
+ from transformers.generation.utils import GenerateOutput
22
+ from transformers.utils import logging, strtobool
23
+
24
+ from .clip import CLIPVisionTower
25
+ from .convnext import ConvNextVisionEncoder
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper()
30
+ XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper()
31
+
32
+ IGNORE_INDEX = -100
33
+ DEFAULT_PAD_TOKEN_INDEX = 0
34
+ IMAGE_TOKEN_INDEX = -200
35
+ DEFAULT_IMAGE_TOKEN = "<image>"
36
+
37
+ # For Objects
38
+ DEFAULT_OBJECT_TOKEN = "<obj<i>>"
39
+ DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
40
+ DEFAULT_OBJECT_INDEX = -300
41
+
42
+ # For Grounding
43
+ DEFAULT_GROUNDING_START = "<ground>"
44
+ DEFAULT_GROUNDING_END = "</ground>"
45
+ DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
46
+ DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
47
+
48
+
49
+ def is_fsdp_enabled():
50
+ return (
51
+ torch.distributed.is_available()
52
+ and torch.distributed.is_initialized()
53
+ and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1
54
+ and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1
55
+ )
56
+
57
+
58
+ class IdentityMap(nn.Module):
59
+ def __init__(self):
60
+ super().__init__()
61
+
62
+ def forward(self, x, *args, **kwargs):
63
+ return x
64
+
65
+ @property
66
+ def config(self):
67
+ return {"mm_projector_type": "identity"}
68
+
69
+
70
+ class SimpleResBlock(nn.Module):
71
+ def __init__(self, channels):
72
+ super().__init__()
73
+ self.pre_norm = nn.LayerNorm(channels)
74
+
75
+ self.proj = nn.Sequential(
76
+ nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
77
+ )
78
+
79
+ def forward(self, x):
80
+ x = self.pre_norm(x)
81
+ return x + self.proj(x)
82
+
83
+
84
+ def build_vision_projector(config, start_hidden_size, delay_load=False, **kwargs):
85
+ projector_type = "mlp2x_gelu"
86
+
87
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
88
+ if mlp_gelu_match:
89
+ mlp_depth = int(mlp_gelu_match.group(1))
90
+ modules = [nn.Linear(start_hidden_size, config.hidden_size)]
91
+ for _ in range(1, mlp_depth):
92
+ modules.append(nn.GELU())
93
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
94
+ return nn.Sequential(*modules)
95
+
96
+ if projector_type == "identity":
97
+ return IdentityMap()
98
+
99
+ raise ValueError(f"Unknown projector type: {projector_type}")
100
+
101
+
102
+ def get_token_slices(input_ids: torch.Tensor):
103
+ """
104
+ Get slices of tokens based on special markers in the input tensor.
105
+
106
+ Args:
107
+ input_ids (torch.Tensor): A tensor of token IDs where IMAGE_TOKEN_INDEX represents an image token,
108
+ DEFAULT_OBJECT_INDEX represents an object token, and all other values represent text tokens.
109
+
110
+ Returns:
111
+ List[Dict[str, Any]]: A list of dictionaries where each dictionary contains the type of the
112
+ token slice ('text', 'image', 'object') and the span as a list of start and end indices.
113
+ """
114
+ # define type markers and corresponding types
115
+ type_map = {IMAGE_TOKEN_INDEX: "image", DEFAULT_OBJECT_INDEX: "object"}
116
+
117
+ # find the positions of special markers
118
+ image_indices = torch.where(input_ids == IMAGE_TOKEN_INDEX)[0]
119
+ object_indices = torch.where(input_ids == DEFAULT_OBJECT_INDEX)[0]
120
+ if len(object_indices) > 0:
121
+ has_object = True
122
+ else:
123
+ has_object = False
124
+
125
+ # merge all the positions of special markers
126
+ special_indices = torch.cat((image_indices, object_indices))
127
+ special_indices, _ = torch.sort(special_indices)
128
+ special_tokens = input_ids[special_indices]
129
+
130
+ slices = []
131
+ start_idx = 0
132
+
133
+ for i, idx in enumerate(special_indices):
134
+ if start_idx < idx:
135
+ slices.append({"type": "text", "span": [start_idx, idx.item()]})
136
+ token_type = type_map[special_tokens[i].item()]
137
+ slices.append({"type": token_type, "span": [idx.item(), idx.item() + 1]})
138
+ start_idx = idx.item() + 1
139
+
140
+ if start_idx < len(input_ids):
141
+ slices.append({"type": "text", "span": [start_idx, len(input_ids)]})
142
+
143
+ return slices, has_object
144
+
145
+
146
+ class StopWordStoppingCriteria(StoppingCriteria):
147
+ """StopWord stopping criteria."""
148
+
149
+ def __init__(self, tokenizer, stop_word):
150
+ self.tokenizer = tokenizer
151
+ self.stop_word = stop_word
152
+ self.length = len(self.stop_word)
153
+
154
+ def __call__(self, input_ids, *args, **kwargs) -> bool:
155
+ cur_text = self.tokenizer.decode(input_ids[0])
156
+ cur_text = cur_text.replace("\r", "").replace("\n", "")
157
+ return cur_text[-self.length :] == self.stop_word
158
+
159
+
160
+ def get_stop_criteria(
161
+ tokenizer,
162
+ stop_words=[],
163
+ ):
164
+ stop_criteria = StoppingCriteriaList()
165
+ for word in stop_words:
166
+ stop_criteria.append(StopWordStoppingCriteria(tokenizer, word))
167
+ return stop_criteria
168
+
169
+
170
+ def gen_sineembed_for_position(pos_tensor, dim_of_pos_feats):
171
+ """Generate sine position embedding from a position tensor.
172
+
173
+ Args:
174
+ pos_tensor (torch.Tensor): shape: [batch_size, N, 4]. the last dimension is [cx, cy, w, h] in
175
+ normalized coordinates in range [0, 1].
176
+ dim_of_pos_feats (int): the number of sine/cosine features per box coordinate.
177
+
178
+ Returns:
179
+ pos (torch.Tensor): shape: [batch_size, N, 4 * dim_of_pos_feats] for [cx, cy, w, h] input.
180
+ """
181
+ scale = 2 * math.pi
182
+ dim_t = torch.arange(
183
+ dim_of_pos_feats, dtype=torch.float32, device=pos_tensor.device
184
+ )
185
+ dim_t = 10000 ** (2 * (dim_t // 2) / dim_of_pos_feats)
186
+ x_embed = pos_tensor[:, :, 0] * scale
187
+ y_embed = pos_tensor[:, :, 1] * scale
188
+ pos_x = x_embed[:, :, None] / dim_t
189
+ pos_y = y_embed[:, :, None] / dim_t
190
+ pos_x = torch.stack(
191
+ (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3
192
+ ).flatten(2)
193
+ pos_y = torch.stack(
194
+ (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3
195
+ ).flatten(2)
196
+ if pos_tensor.size(-1) == 2:
197
+ pos = torch.cat((pos_y, pos_x), dim=2)
198
+ elif pos_tensor.size(-1) == 4:
199
+ w_embed = pos_tensor[:, :, 2] * scale
200
+ pos_w = w_embed[:, :, None] / dim_t
201
+ pos_w = torch.stack(
202
+ (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3
203
+ ).flatten(2)
204
+
205
+ h_embed = pos_tensor[:, :, 3] * scale
206
+ pos_h = h_embed[:, :, None] / dim_t
207
+ pos_h = torch.stack(
208
+ (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3
209
+ ).flatten(2)
210
+
211
+ pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
212
+ else:
213
+ raise ValueError("Unknown pos_tensor shape(-1):{}".format(pos_tensor.size(-1)))
214
+ return pos
215
+
216
+
217
+ class MultiLevelROIVisualPrompt(nn.Module):
218
+ """Initialize the MultiLevelROIVisualPrompt.
219
+
220
+ Args:
221
+ output_size (Optional[int]): The size of the output. Default is None.
222
+ channel_per_level (List[int]): List of channels per level. Default is [192, 384, 768, 1536].
223
+ spatial_scale (Optional[float]): The spatial scale factor. Default is None.
224
+ with_additional_projection (bool): Whether to use additional projection. Default is False.
225
+ visual_prompt_hidden_size (int): The hidden size of the visual prompt. Default is 1024.
226
+ add_pos_embedding (bool): Whether to add position embedding. Default is False.
227
+ pos_embedding_dim (int): The dimension of the position embedding. Default is 1024.
228
+ """
229
+
230
+ def __init__(
231
+ self,
232
+ output_size: int = None,
233
+ channel_per_level: List[int] = [192, 384, 768, 1536],
234
+ spatail_scale: float = None,
235
+ add_pos_embedding: bool = False,
236
+ pos_embedding_dim: int = 1024,
237
+ ):
238
+ super(MultiLevelROIVisualPrompt, self).__init__()
239
+ self.output_size = output_size
240
+ self.channel_per_level = channel_per_level
241
+ self.spatail_scale = spatail_scale
242
+ self.add_pos_embedding = add_pos_embedding
243
+ self.pos_embedding_dim = pos_embedding_dim
244
+
245
+ def __call__(
246
+ self,
247
+ multi_level_features: List[torch.Tensor],
248
+ boxes: Union[torch.Tensor, List[torch.Tensor]],
249
+ ) -> torch.Tensor:
250
+ """Performs Region of Interest (RoI) Align operator on multi-level features. The RoI
251
+ feature on each scale will go through a different linear layer for projection. Different
252
+ RoI features will be summed up and then average pooled.
253
+
254
+ Args:
255
+ multi_level_features (List[Tensor[N, C, H, W]]): Feature maps from different levels
256
+ boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
257
+ format where the regions will be taken from.
258
+ Returns:
259
+ Tensor[1, K, C]: The output tensor of shape 1 x K x C, where K is the number of RoIs
260
+ """
261
+ boxes[0] = boxes[0].float()
262
+ concat_multi_level_feature = []
263
+ max_height = max([feature.shape[2] for feature in multi_level_features])
264
+ max_width = max([feature.shape[3] for feature in multi_level_features])
265
+ # interpolate to the same size
266
+ for level, feature in enumerate(multi_level_features):
267
+ if level != 0:
268
+ concat_multi_level_feature.append(
269
+ F.interpolate(
270
+ feature.float(),
271
+ size=(max_height, max_width),
272
+ mode="bilinear",
273
+ align_corners=False,
274
+ )
275
+ )
276
+ else:
277
+ concat_multi_level_feature.append(feature.float())
278
+ concat_multi_level_feature = torch.cat(concat_multi_level_feature, dim=1)
279
+
280
+ out_box_feat = roi_align(
281
+ concat_multi_level_feature,
282
+ boxes,
283
+ output_size=self.output_size,
284
+ spatial_scale=self.spatail_scale,
285
+ )
286
+
287
+ # Average Pooling -> n,c -> 1,n,c
288
+ out_box_feat = out_box_feat.mean(dim=(2, 3)).reshape(
289
+ 1, out_box_feat.shape[0], out_box_feat.shape[1]
290
+ )
291
+ if self.add_pos_embedding:
292
+ # note: these boxes are in xyxy, unnormalized format, so normalize them first
293
+ boxes = boxes[0] # (N, 4)
294
+ boxes = boxes.to(out_box_feat.dtype)
295
+ original_img_width = max_width / self.spatail_scale
296
+ original_img_height = max_height / self.spatail_scale
297
+ boxes[:, [0, 2]] = boxes[:, [0, 2]] / original_img_width
298
+ boxes[:, [1, 3]] = boxes[:, [1, 3]] / original_img_height
299
+ # convert from xyxy to cx, cy, w, h
300
+ boxes[:, 2] = boxes[:, 2] - boxes[:, 0]
301
+ boxes[:, 3] = boxes[:, 3] - boxes[:, 1]
302
+ boxes[:, 0] = boxes[:, 0] + boxes[:, 2] / 2
303
+ boxes[:, 1] = boxes[:, 1] + boxes[:, 3] / 2
304
+ pos_embed = gen_sineembed_for_position(
305
+ boxes.unsqueeze(0), self.pos_embedding_dim // 4
306
+ )
307
+ out_box_feat = out_box_feat + pos_embed
308
+
309
+ return out_box_feat
310
+
311
+
312
+ class RexSeekQwenConfig(Qwen2Config):
313
+ model_type = "rexseek_qwen"
314
+
315
+
316
+ class RexSeekQwenForCausalLM(Qwen2ForCausalLM):
317
+
318
+ config_class = RexSeekQwenConfig
319
+
320
+ def __init__(self, config):
321
+ super().__init__(config)
322
+ # low resolution vision encoder
323
+ vision_tower = getattr(
324
+ config,
325
+ "mm_vision_tower",
326
+ getattr(config, "vision_tower", None),
327
+ )
328
+ self.vision_tower = CLIPVisionTower(
329
+ vision_tower,
330
+ args=config,
331
+ )
332
+ # high resolution vision encoder
333
+ self.vision_tower_aux = ConvNextVisionEncoder()
334
+
335
+ # vision projector
336
+ self.mm_projector = build_vision_projector(
337
+ config, start_hidden_size=2560
338
+ ) # projector for vision_tower
339
+ # projector for object token
340
+ self.mm_object_projector = build_vision_projector(
341
+ config, start_hidden_size=2880
342
+ )
343
+ # visual prompt encoder
344
+ self.vocab_size = config.vocab_size
345
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
346
+ # Initialize weights and apply final processing
347
+ self.box_encoder = MultiLevelROIVisualPrompt(
348
+ output_size=7,
349
+ channel_per_level=[192, 384, 768, 1536], # ConvNeXt Large
350
+ spatail_scale=192 / 768,
351
+ add_pos_embedding=True,
352
+ pos_embedding_dim=2880,
353
+ )
354
+ self.post_init()
355
+ print("model initialized")
356
+
357
+ def get_vision_tower(self):
358
+ vision_tower = getattr(self, "vision_tower", None)
359
+ if type(vision_tower) is list:
360
+ vision_tower = vision_tower[0]
361
+ return vision_tower
362
+
363
+ def get_vision_tower_aux(self):
364
+ vision_tower_aux = getattr(self, "vision_tower_aux", None)
365
+ if type(vision_tower_aux) is list:
366
+ vision_tower_aux = vision_tower_aux[0]
367
+ return vision_tower_aux
368
+
369
+ def get_model(self):
370
+ return self.model
371
+
372
+ def encode_images(self, images, images_aux):
373
+ low_res_feat = self.get_vision_tower()(images)
374
+ aux_output = self.get_vision_tower_aux()(images_aux)
375
+ visual_outputs_aux = aux_output["image_features"]
376
+ high_res_feat = aux_output["last_feat"] # (B, 1536, 24, 24)
377
+ # concat the low res features with the high res features
378
+ b, c, h, w = high_res_feat.shape # (2, 1536, 24, 24)
379
+ _, _, d = low_res_feat.shape # (2, 576, 1024)
380
+ high_res_feat = high_res_feat.view(b, c, h * w).transpose(1, 2)
381
+ image_features = torch.cat((low_res_feat, high_res_feat), dim=-1)
382
+ image_features = self.mm_projector(image_features)
383
+ return image_features, visual_outputs_aux
384
+
385
+ def encode_objects(
386
+ self, bboxes, visual_outputs_aux, dtype, num_gt_boxes_per_image=None
387
+ ):
388
+ """Encode object features from bounding boxes.
389
+
390
+ Args:
391
+ bboxes (torch.Tensor): bounding boxes in the shape of (N, 4)
392
+ visual_outputs_aux (List[torch.Tensor]): multi-level ConvNeXt feature maps used for RoI pooling
393
+
394
+ Returns:
395
+ torch.Tensor: object features in the shape of (N, hidden_size)
396
+ """
397
+ bbox_visual_outputs = []
398
+ for batch_idx, boxes in enumerate(bboxes):
399
+ num_box = (
400
+ num_gt_boxes_per_image[batch_idx]
401
+ if num_gt_boxes_per_image is not None
402
+ else len(boxes)
403
+ )
404
+ boxes = boxes[:num_box]
405
+ if len(boxes) == 0:
406
+ bbox_visual_outputs.append(None)
407
+ continue
408
+ multi_level_aux_features = [
409
+ visual_output_aux[batch_idx].unsqueeze(0)
410
+ for visual_output_aux in visual_outputs_aux
411
+ ]
412
+ out_vp_feat = self.box_encoder(
413
+ multi_level_aux_features,
414
+ [boxes],
415
+ ).squeeze(0)
416
+ out_vp_feat = out_vp_feat.to(dtype)
417
+ out_vp_feat = self.mm_object_projector(out_vp_feat)
418
+ bbox_visual_outputs.append(out_vp_feat)
419
+ # b,n,c
420
+ return bbox_visual_outputs
421
+
422
+ def prepare_inputs_labels_for_multimodal(
423
+ self,
424
+ input_ids,
425
+ position_ids,
426
+ attention_mask,
427
+ past_key_values,
428
+ labels,
429
+ pixel_values=None,
430
+ pixel_values_aux=None,
431
+ gt_boxes=None,
432
+ num_gt_boxes_per_image=None,
433
+ ):
434
+ if pixel_values is None:
435
+ return (
436
+ input_ids,
437
+ position_ids,
438
+ attention_mask,
439
+ past_key_values,
440
+ None,
441
+ labels,
442
+ )
443
+ pixel_values, visual_outputs_aux = self.encode_images(
444
+ pixel_values, pixel_values_aux
445
+ ) # (B, 576, 2048)
446
+ if gt_boxes is not None:
447
+ bbox_feats = self.encode_objects(
448
+ gt_boxes, visual_outputs_aux, pixel_values.dtype, num_gt_boxes_per_image
449
+ )
450
+ _labels = labels
451
+ _position_ids = position_ids
452
+ _attention_mask = attention_mask
453
+ if attention_mask is None:
454
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
455
+ else:
456
+ attention_mask = attention_mask.bool()  # padding mask in shape (B, L)
457
+ if position_ids is None:
458
+ position_ids = torch.arange(
459
+ 0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
460
+ )
461
+ if labels is None:
462
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
463
+
464
+ input_ids = [
465
+ cur_input_ids[cur_attention_mask]
466
+ for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
467
+ ]
468
+ labels = [
469
+ cur_labels[cur_attention_mask]
470
+ for cur_labels, cur_attention_mask in zip(labels, attention_mask)
471
+ ]
472
+
473
+ new_input_embeds = []
474
+ new_labels = []
475
+ cur_image_idx = 0
476
+ cur_object_idx = 0
477
+ for batch_idx, cur_input_ids in enumerate(input_ids):
478
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
479
+ if num_images == 0:
480
+ cur_image_features = pixel_values[cur_image_idx]
481
+ cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
482
+ cur_input_embeds = torch.cat(
483
+ [cur_input_embeds_1, cur_image_features[0:0]], dim=0
484
+ )
485
+ new_input_embeds.append(cur_input_embeds)
486
+ new_labels.append(labels[batch_idx])
487
+ cur_image_idx += 1
488
+ cur_object_idx += 1
489
+ continue
490
+
491
+ cur_labels = labels[batch_idx]
492
+ token_slices, has_object = get_token_slices(cur_input_ids)
493
+ result_input_embeddings = []
494
+ result_output_labels = []
495
+ cur_gt_bnox_indice = 0
496
+ cur_object_features = None
497
+ for slice in token_slices:
498
+ slice_type = slice["type"]
499
+ slice_span = slice["span"]
500
+ if slice_type == "text":
501
+ cur_input_ids_noim = cur_input_ids[slice_span[0] : slice_span[1]]
502
+ cur_labels_noim = cur_labels[slice_span[0] : slice_span[1]]
503
+ cur_input_embeds = self.get_model().embed_tokens(cur_input_ids_noim)
504
+ result_input_embeddings.append(cur_input_embeds)
505
+ result_output_labels.append(cur_labels_noim)
506
+ elif slice_type == "image":
507
+ cur_input_embeds = pixel_values[cur_image_idx]
508
+ result_input_embeddings.append(cur_input_embeds)
509
+ result_output_labels.append(
510
+ torch.full(
511
+ (cur_input_embeds.shape[0],),
512
+ IGNORE_INDEX,
513
+ device=cur_labels.device,
514
+ dtype=cur_labels.dtype,
515
+ )
516
+ )
517
+ cur_image_idx += 1
518
+ elif slice_type == "object":
519
+ try:
520
+ result_input_embeddings.append(
521
+ bbox_feats[cur_object_idx][cur_gt_bnox_indice].unsqueeze(0)
522
+ )
523
+ except Exception:
524
+ raise ValueError(
525
+ f"current boxe_feats.shape: {bbox_feats[cur_object_idx].shape}, "
526
+ )
527
+ cur_gt_bnox_indice += 1
528
+ result_output_labels.append(
529
+ torch.full(
530
+ (1,),
531
+ IGNORE_INDEX,
532
+ device=cur_labels.device,
533
+ dtype=cur_labels.dtype,
534
+ )
535
+ )
536
+ cur_object_idx += 1
537
+ result_input_embeddings = torch.cat(result_input_embeddings)
538
+ result_output_labels = torch.cat(result_output_labels)
539
+ assert len(result_output_labels) == len(result_input_embeddings)
540
+ new_input_embeds.append(result_input_embeddings)
541
+ new_labels.append(result_output_labels)
542
+
543
+ # Truncate sequences to max length as image embeddings can make the sequence longer
544
+ tokenizer_model_max_length = getattr(
545
+ self.config, "tokenizer_model_max_length", None
546
+ )
547
+ if tokenizer_model_max_length is not None:
548
+ new_input_embeds = [
549
+ x[:tokenizer_model_max_length] for x in new_input_embeds
550
+ ]
551
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
552
+
553
+ # Combine them
554
+ max_len = max(x.shape[0] for x in new_input_embeds)
555
+ batch_size = len(new_input_embeds)
556
+
557
+ new_input_embeds_padded = []
558
+ new_labels_padded = torch.full(
559
+ (batch_size, max_len),
560
+ IGNORE_INDEX,
561
+ dtype=new_labels[0].dtype,
562
+ device=new_labels[0].device,
563
+ )
564
+ attention_mask = torch.zeros(
565
+ (batch_size, max_len),
566
+ dtype=attention_mask.dtype,
567
+ device=attention_mask.device,
568
+ )
569
+ position_ids = torch.zeros(
570
+ (batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device
571
+ )
572
+
573
+ for i, (cur_new_embed, cur_new_labels) in enumerate(
574
+ zip(new_input_embeds, new_labels)
575
+ ):
576
+ cur_len = cur_new_embed.shape[0]
577
+ new_input_embeds_padded.append(
578
+ torch.cat(
579
+ (
580
+ cur_new_embed,
581
+ torch.zeros(
582
+ (max_len - cur_len, cur_new_embed.shape[1]),
583
+ dtype=cur_new_embed.dtype,
584
+ device=cur_new_embed.device,
585
+ ),
586
+ ),
587
+ dim=0,
588
+ )
589
+ )
590
+ if cur_len > 0:
591
+ new_labels_padded[i, :cur_len] = cur_new_labels
592
+ attention_mask[i, :cur_len] = True
593
+ position_ids[i, :cur_len] = torch.arange(
594
+ 0, cur_len, dtype=position_ids.dtype, device=position_ids.device
595
+ )
596
+
597
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
598
+
599
+ if _labels is None:
600
+ new_labels = None
601
+ else:
602
+ new_labels = new_labels_padded
603
+
604
+ if _attention_mask is None:
605
+ attention_mask = None
606
+ else:
607
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
608
+
609
+ if _position_ids is None:
610
+ position_ids = None
611
+
612
+ return (
613
+ None,
614
+ position_ids,
615
+ attention_mask,
616
+ past_key_values,
617
+ new_input_embeds,
618
+ new_labels,
619
+ )
620
+
621
+ @torch.no_grad()
622
+ def generate(
623
+ self,
624
+ inputs: Optional[torch.Tensor],
625
+ pixel_values: Optional[torch.Tensor],
626
+ pixel_values_aux: Optional[torch.Tensor],
627
+ position_ids: Optional[torch.Tensor] = None,
628
+ attention_mask: Optional[torch.Tensor] = None,
629
+ inputs_embeds: Optional[torch.Tensor] = None,
630
+ **kwargs,
631
+ ) -> Union[GenerateOutput, torch.LongTensor]:
632
+
633
+ if inputs_embeds is None:
634
+ position_ids = kwargs.pop("position_ids", None)
635
+ attention_mask = kwargs.pop("attention_mask", None)
636
+ gt_boxes = kwargs.pop("gt_boxes", None)
637
+ num_gt_boxes_per_image = kwargs.pop("num_gt_boxes_per_image", None)
638
+
639
+ if pixel_values is not None:
640
+ (inputs, position_ids, attention_mask, _, inputs_embeds, _) = (
641
+ self.prepare_inputs_labels_for_multimodal(
642
+ inputs,
643
+ position_ids,
644
+ attention_mask,
645
+ past_key_values=None,
646
+ labels=None,
647
+ pixel_values=pixel_values,
648
+ pixel_values_aux=pixel_values_aux,
649
+ gt_boxes=gt_boxes,
650
+ num_gt_boxes_per_image=num_gt_boxes_per_image,
651
+ )
652
+ )
653
+
654
+ else:
655
+ inputs_embeds = self.get_model().embed_tokens(inputs)
656
+
657
+ return super().generate(
658
+ position_ids=position_ids,
659
+ attention_mask=attention_mask,
660
+ inputs_embeds=inputs_embeds,
661
+ **kwargs,
662
+ )
663
+
664
+
665
+ AutoConfig.register("rexseek_qwen", RexSeekQwenConfig)
666
+ AutoModelForCausalLM.register(RexSeekQwenConfig, RexSeekQwenForCausalLM)
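Reading aid (not part of the uploaded file): the tail of prepare_inputs_labels_for_multimodal above right-pads every sample's embedding sequence to the batch maximum, fills the padded label slots with IGNORE_INDEX, and rebuilds the attention mask and position ids over the valid prefix. The sketch below is a minimal, self-contained illustration of that padding pattern; the helper name pad_multimodal_batch and the toy tensors are ours.

import torch

IGNORE_INDEX = -100

def pad_multimodal_batch(embeds_list, labels_list):
    """Right-pad variable-length (seq_len_i, hidden) embeddings and (seq_len_i,) labels."""
    max_len = max(e.shape[0] for e in embeds_list)
    hidden = embeds_list[0].shape[1]
    batch = len(embeds_list)
    padded_embeds = torch.zeros(batch, max_len, hidden, dtype=embeds_list[0].dtype)
    padded_labels = torch.full((batch, max_len), IGNORE_INDEX, dtype=labels_list[0].dtype)
    attention_mask = torch.zeros(batch, max_len, dtype=torch.bool)
    position_ids = torch.zeros(batch, max_len, dtype=torch.long)
    for i, (emb, lab) in enumerate(zip(embeds_list, labels_list)):
        n = emb.shape[0]
        padded_embeds[i, :n] = emb          # copy the real embeddings, leave zeros as padding
        padded_labels[i, :n] = lab          # padded positions stay IGNORE_INDEX
        attention_mask[i, :n] = True        # attend only to the valid prefix
        position_ids[i, :n] = torch.arange(n)
    return padded_embeds, padded_labels, attention_mask, position_ids

# Toy check: two samples of length 3 and 5 with hidden size 4.
embeds = [torch.randn(3, 4), torch.randn(5, 4)]
labels = [torch.zeros(3, dtype=torch.long), torch.ones(5, dtype=torch.long)]
e, l, m, p = pad_multimodal_batch(embeds, labels)
print(e.shape, l.shape, m.sum(dim=1), p[0])
# torch.Size([2, 5, 4]) torch.Size([2, 5]) tensor([3, 5]) tensor([0, 1, 2, 0, 0])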
preprocessing_rexseek.py ADDED
@@ -0,0 +1,259 @@
1
+ from PIL import Image
2
+
3
+
4
+ import re
5
+ from typing import List, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torchvision.transforms.functional as F
10
+ from transformers import AutoTokenizer
11
+
12
+ from transformers.processing_utils import ProcessorMixin
13
+
14
+ from transformers.utils import logging
15
+
16
+ logger = logging.get_logger(__name__)
17
+
18
+
19
+ IGNORE_INDEX = -100
20
+ DEFAULT_PAD_TOKEN_INDEX = 0
21
+ IMAGE_TOKEN_INDEX = -200
22
+ DEFAULT_IMAGE_TOKEN = "<image>"
23
+
24
+ # For Objects
25
+ DEFAULT_OBJECT_TOKEN = "<obj<i>>"
26
+ DEFAULT_OBJECT_FEATURE_TOKEN = "<objfeat>"
27
+ DEFAULT_OBJECT_INDEX = -300
28
+
29
+ # For Grounding
30
+ DEFAULT_GROUNDING_START = "<ground>"
31
+ DEFAULT_GROUNDING_END = "</ground>"
32
+ DEFAULT_GROUNDING_OBJECTS_START = "<objects>"
33
+ DEFAULT_GROUNDING_OBJECTS_END = "</objects>"
34
+
35
+
36
+ def xyxy_to_xywh(boxes):
37
+ """
38
+ Convert boxes from xyxy to xywh format.
39
+
40
+ Parameters:
41
+ boxes (numpy.ndarray): An array of shape (N, 4) where N is the number of boxes.
42
+ Each box is represented as [x_min, y_min, x_max, y_max].
43
+
44
+ Returns:
45
+ numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, w, h].
46
+ """
47
+ boxes = np.array(boxes)
48
+ x_min, y_min, x_max, y_max = (
49
+ boxes[:, 0],
50
+ boxes[:, 1],
51
+ boxes[:, 2],
52
+ boxes[:, 3],
53
+ )
54
+ w = x_max - x_min
55
+ h = y_max - y_min
56
+ return np.stack([x_min, y_min, w, h], axis=1)
57
+
58
+
59
+ def xywh_to_xyxy(boxes):
60
+ """
61
+ Convert boxes from xywh to xyxy format.
62
+
63
+ Parameters:
64
+ boxes (numpy.ndarray): An array of shape (N, 4) where N is the number of boxes.
65
+ Each box is represented as [x, y, width, height].
66
+
67
+ Returns:
68
+ numpy.ndarray: An array of shape (N, 4) where each box is represented as [x_min, y_min, x_max, y_max].
69
+ """
70
+ boxes = np.array(boxes)
71
+ x, y, width, height = (
72
+ boxes[:, 0],
73
+ boxes[:, 1],
74
+ boxes[:, 2],
75
+ boxes[:, 3],
76
+ )
77
+ x_max = x + width
78
+ y_max = y + height
79
+ return np.stack([x, y, x_max, y_max], axis=1)
80
+
81
+
82
+ def expand2square(pil_img, background_color):
83
+ width, height = pil_img.size
84
+ if width == height:
85
+ return pil_img
86
+ elif width > height:
87
+ result = Image.new(pil_img.mode, (width, width), background_color)
88
+ result.paste(pil_img, (0, (width - height) // 2))
89
+ return result
90
+ else:
91
+ result = Image.new(pil_img.mode, (height, height), background_color)
92
+ result.paste(pil_img, ((height - width) // 2, 0))
93
+ return result
94
+
95
+
96
+ def pad_boxes(gt_boxes, old_size):
97
+ old_w, old_h = old_size
98
+ gt_boxes = np.array(gt_boxes).astype(np.float32)
99
+ # Calculate the padding added
100
+ if old_w > old_h:
101
+ pad_top = (old_w - old_h) // 2
102
+ pad_bottom = old_w - old_h - pad_top
103
+ pad_left, pad_right = 0, 0
104
+ else:
105
+ pad_left = (old_h - old_w) // 2
106
+ pad_right = old_h - old_w - pad_left
107
+ pad_top, pad_bottom = 0, 0
108
+
109
+ # Adjust the boxes for padding
110
+ gt_boxes[:, 0] += pad_left # x
111
+ gt_boxes[:, 1] += pad_top # y
112
+ return gt_boxes
113
+
114
+
115
+ def resize_boxes(gt_boxes, old_size, new_size):
116
+ old_w, old_h = old_size
117
+ new_h, new_w = new_size
118
+ gt_boxes = np.array(gt_boxes).astype(np.float32)
119
+ # Calculate scale factors
120
+ scale_x = new_w / max(old_w, old_h)
121
+ scale_y = new_h / max(old_w, old_h)
122
+
123
+ # Resize the boxes
124
+ gt_boxes[:, 0] *= scale_x # x
125
+ gt_boxes[:, 1] *= scale_y # y
126
+ gt_boxes[:, 2] *= scale_x # w
127
+ gt_boxes[:, 3] *= scale_y # h
128
+
129
+ return gt_boxes
130
+
131
+
132
+ def split_special_strings(input_string: str, special_strings: list[str] = None):
133
+ """Split the input string into a list of strings, keeping the special strings.
134
+
135
+ Args:
136
+ input_string (str): The input string to split.
+ special_strings (list[str]): The special strings to split on and keep in the output.
137
+
138
+ Example:
139
+
140
+ input_string = "<image>\n<obj0><objfeat><obj1><objfeat>\n I am happy today."
141
+ output = ['<image>', '\n<obj0>', '<objfeat>', '<obj1>', '<objfeat>', '\n I am happy today.']
142
+
143
+ Returns:
144
+ list: A list of strings, with the special strings separated from the rest of the input string.
145
+ """
146
+ # Create a regex pattern to match the special strings
147
+ pattern = "|".join(map(re.escape, special_strings))
148
+
149
+ # Split the input string using the pattern, keeping the special strings in the result
150
+ split_list = re.split(f"({pattern})", input_string)
151
+
152
+ # Remove empty strings from the list
153
+ split_list = [s for s in split_list if s]
154
+
155
+ return split_list
156
+
157
+
158
+ def tokenizer_image_object_token(prompt, tokenizer):
159
+ bos_token_id = tokenizer.bos_token_id
160
+ split_tokens = [DEFAULT_IMAGE_TOKEN, DEFAULT_OBJECT_FEATURE_TOKEN]
161
+ chunks = split_special_strings(prompt, split_tokens)
162
+ input_encode = [bos_token_id] if bos_token_id else []
163
+ for chunk in chunks:
164
+ if chunk == DEFAULT_IMAGE_TOKEN:
165
+ input_encode.append(IMAGE_TOKEN_INDEX)
166
+ elif chunk == DEFAULT_OBJECT_FEATURE_TOKEN:
167
+ input_encode.append(DEFAULT_OBJECT_INDEX)
168
+ else:
169
+ input_encode.extend(tokenizer.encode(chunk, add_special_tokens=False))
170
+ return input_encode
171
+
172
+
173
+ class RexSeekProcessor(ProcessorMixin):
174
+ attributes = ["image_processor", "tokenizer"]
175
+ image_processor_class = "AutoImageProcessor"
176
+ tokenizer_class = "AutoTokenizer"
177
+
178
+ def __init__(self, image_processor=None, tokenizer: AutoTokenizer = None, **kwargs):
179
+ # self.image_processor = image_processor
180
+ # self.tokenizer = tokenizer
181
+ super().__init__(image_processor, tokenizer)
182
+ self._special_tokens = None
183
+ self.template = dict(
184
+ SYSTEM=("<|im_start|>system\n{system}<|im_end|>\n"),
185
+ INSTRUCTION=(
186
+ "<|im_start|>user\n{input}<|im_end|>\n" "<|im_start|>assistant\n"
187
+ ),
188
+ SUFFIX="<|im_end|>",
189
+ SUFFIX_AS_EOS=True,
190
+ SEP="\n",
191
+ STOP_WORDS=["<|im_end|>", "<|endoftext|>"],
192
+ )
193
+
194
+ def process(
195
+ self,
196
+ image: Union[str, Image.Image],
197
+ bbox: List[List[int]],
198
+ question: str,
199
+ ):
200
+ """Prepare input data for inference.
201
+
202
+ Args:
203
+ image (Union[str, Image.Image]): The image to process.
204
+ bbox (List[List[int]]): A list of bounding boxes for the image. Each bounding box should
205
+ be in order of [x_min, y_min, x_max, y_max] (xyxy format).
206
+ question (str): The question to ask about the image.
207
+ """
208
+ data_dict = {}
209
+ # step1 load image
210
+ if isinstance(image, str):
211
+ image = Image.open(image).convert("RGB")
212
+ ori_w, ori_h = F.get_image_size(image)
213
+ image = expand2square(
214
+ image,
215
+ tuple(int(x * 255) for x in self.image_processor.image_mean),
216
+ )
217
+ pad_w, pad_h = F.get_image_size(image)
218
+ image_aux = self.image_processor.preprocess(image, return_tensors="pt")[
219
+ "pixel_values"
220
+ ][0]
221
+ resize_h, resize_w = image_aux.shape[-2:]
222
+ data_dict["pixel_values_aux"] = image_aux.unsqueeze(0)
223
+ image = image_aux.clone()
224
+ image = torch.nn.functional.interpolate(
225
+ image[None],
226
+ size=[336, 336],
227
+ mode="bilinear",
228
+ align_corners=False,
229
+ )[0]
230
+ data_dict["pixel_values"] = image.unsqueeze(0)
231
+
232
+ # step2 load boxes
233
+ bbox = xyxy_to_xywh(bbox)
234
+ bbox = pad_boxes(bbox, (ori_w, ori_h))
235
+ bbox = resize_boxes(bbox, (pad_w, pad_h), (resize_h, resize_w))
236
+ data_dict["gt_boxes"] = torch.tensor(xywh_to_xyxy(bbox)).unsqueeze(0)
237
+
238
+ # step3 prepare question
239
+ total_num_boxes = len(bbox)
240
+ obj_tokens = [
241
+ DEFAULT_OBJECT_TOKEN.replace("<i>", str(i)) for i in range(total_num_boxes)
242
+ ]
243
+ obj_tokens = (
244
+ DEFAULT_OBJECT_FEATURE_TOKEN.join(obj_tokens) + DEFAULT_OBJECT_FEATURE_TOKEN
245
+ )
246
+ question = question.replace(DEFAULT_IMAGE_TOKEN, "")
247
+ question = DEFAULT_IMAGE_TOKEN + "\n" + obj_tokens + "\n" + question
248
+
249
+ inputs = ""
250
+ inputs += self.template["INSTRUCTION"].format(input=question, round=1)
251
+
252
+ # step4 tokenize question
253
+ input_ids = tokenizer_image_object_token(inputs, self.tokenizer)
254
+ data_dict["input_ids"] = torch.tensor(input_ids).unsqueeze(0)
255
+
256
+ return data_dict
257
+
258
+
259
+ RexSeekProcessor.register_for_auto_class()
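For orientation, a hypothetical call to RexSeekProcessor.process looks roughly like the sketch below; it assumes a processor instance has already been loaded (see processor_config.json further down for how AutoProcessor resolves it), and the image path, boxes, and question are placeholders. The shapes in the comments follow from the 768x768 CLIP preprocessor config shipped in this upload and the 336x336 interpolation in process().

from PIL import Image

# `processor` is assumed to be an already-loaded RexSeekProcessor instance.
image = Image.open("demo.jpg").convert("RGB")               # placeholder image
candidate_boxes = [[48, 60, 320, 410], [10, 20, 110, 150]]  # [x_min, y_min, x_max, y_max] per box
data = processor.process(
    image=image,
    bbox=candidate_boxes,
    question="Please detect the person in this image.",     # placeholder question
)

print(sorted(data.keys()))
# ['gt_boxes', 'input_ids', 'pixel_values', 'pixel_values_aux']
print(data["pixel_values"].shape, data["pixel_values_aux"].shape)
# torch.Size([1, 3, 336, 336]) torch.Size([1, 3, 768, 768])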
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "crop_size": {
3
+ "height": 768,
4
+ "width": 768
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "processor_class": "RexSeekProcessor",
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "shortest_edge": 768
27
+ }
28
+ }
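With this config (shortest edge resized to 768, center crop 768), a square-padded input ends up at 768x768, so the box helpers in preprocessing_rexseek.py scale coordinates by 768 / max(width, height). A small worked example of the box pipeline (ours), reusing those helpers:

from preprocessing_rexseek import xyxy_to_xywh, xywh_to_xyxy, pad_boxes, resize_boxes

ori_w, ori_h = 640, 480                              # example original image size
boxes = xyxy_to_xywh([[100, 100, 200, 200]])         # -> [[100, 100, 100, 100]] (xywh)
boxes = pad_boxes(boxes, (ori_w, ori_h))             # square-pad to 640x640: y shifts by 80
boxes = resize_boxes(boxes, (640, 640), (768, 768))  # scale by 768 / 640 = 1.2
print(xywh_to_xyxy(boxes))                           # [[120. 216. 240. 336.]]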
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_rexseek.RexSeekProcessor"
4
+ },
5
+ "processor_class": "RexSeekProcessor"
6
+ }
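The auto_map entry above is what lets a plain AutoProcessor call resolve to the custom class defined in preprocessing_rexseek.py. A minimal loading sketch (the checkpoint path is a placeholder; trust_remote_code is required because the processor code ships with the repository):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "path/to/this/checkpoint",   # placeholder: local path or Hub id of this upload
    trust_remote_code=True,      # needed so preprocessing_rexseek.RexSeekProcessor is imported
)
print(type(processor).__name__)  # expected: RexSeekProcessor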
special_tokens_map.json ADDED
@@ -0,0 +1,128 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<obj0>",
17
+ "<obj1>",
18
+ "<obj2>",
19
+ "<obj3>",
20
+ "<obj4>",
21
+ "<obj5>",
22
+ "<obj6>",
23
+ "<obj7>",
24
+ "<obj8>",
25
+ "<obj9>",
26
+ "<obj10>",
27
+ "<obj11>",
28
+ "<obj12>",
29
+ "<obj13>",
30
+ "<obj14>",
31
+ "<obj15>",
32
+ "<obj16>",
33
+ "<obj17>",
34
+ "<obj18>",
35
+ "<obj19>",
36
+ "<obj20>",
37
+ "<obj21>",
38
+ "<obj22>",
39
+ "<obj23>",
40
+ "<obj24>",
41
+ "<obj25>",
42
+ "<obj26>",
43
+ "<obj27>",
44
+ "<obj28>",
45
+ "<obj29>",
46
+ "<obj30>",
47
+ "<obj31>",
48
+ "<obj32>",
49
+ "<obj33>",
50
+ "<obj34>",
51
+ "<obj35>",
52
+ "<obj36>",
53
+ "<obj37>",
54
+ "<obj38>",
55
+ "<obj39>",
56
+ "<obj40>",
57
+ "<obj41>",
58
+ "<obj42>",
59
+ "<obj43>",
60
+ "<obj44>",
61
+ "<obj45>",
62
+ "<obj46>",
63
+ "<obj47>",
64
+ "<obj48>",
65
+ "<obj49>",
66
+ "<obj50>",
67
+ "<obj51>",
68
+ "<obj52>",
69
+ "<obj53>",
70
+ "<obj54>",
71
+ "<obj55>",
72
+ "<obj56>",
73
+ "<obj57>",
74
+ "<obj58>",
75
+ "<obj59>",
76
+ "<obj60>",
77
+ "<obj61>",
78
+ "<obj62>",
79
+ "<obj63>",
80
+ "<obj64>",
81
+ "<obj65>",
82
+ "<obj66>",
83
+ "<obj67>",
84
+ "<obj68>",
85
+ "<obj69>",
86
+ "<obj70>",
87
+ "<obj71>",
88
+ "<obj72>",
89
+ "<obj73>",
90
+ "<obj74>",
91
+ "<obj75>",
92
+ "<obj76>",
93
+ "<obj77>",
94
+ "<obj78>",
95
+ "<obj79>",
96
+ "<obj80>",
97
+ "<obj81>",
98
+ "<obj82>",
99
+ "<obj83>",
100
+ "<obj84>",
101
+ "<obj85>",
102
+ "<obj86>",
103
+ "<obj87>",
104
+ "<obj88>",
105
+ "<obj89>",
106
+ "<obj90>",
107
+ "<obj91>",
108
+ "<obj92>",
109
+ "<obj93>",
110
+ "<obj94>",
111
+ "<obj95>",
112
+ "<obj96>",
113
+ "<obj97>",
114
+ "<obj98>",
115
+ "<obj99>",
116
+ "<ground>",
117
+ "</ground>",
118
+ "<objects>",
119
+ "</objects>"
120
+ ],
121
+ "eos_token": {
122
+ "content": "<|im_end|>",
123
+ "lstrip": false,
124
+ "normalized": false,
125
+ "rstrip": false,
126
+ "single_word": false
127
+ }
128
+ }
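Because the object and grounding markers are registered as additional special tokens, the tokenizer keeps each of them as a single token instead of splitting them into sub-words; the ids they map to are listed under added_tokens_decoder in tokenizer_config.json below. A quick check (ours; the checkpoint path is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/checkpoint")  # placeholder path
print(tokenizer.convert_tokens_to_ids("<obj0>"))      # 151665
print(tokenizer.convert_tokens_to_ids("<ground>"))    # 151765
print(tokenizer.tokenize("<ground><obj0></ground>"))  # ['<ground>', '<obj0>', '</ground>']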
tokenizer_config.json ADDED
@@ -0,0 +1,1145 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<obj0>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<obj1>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<obj2>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<obj3>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<obj4>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<obj5>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<obj6>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<obj7>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<obj8>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<obj9>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<obj10>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "151676": {
270
+ "content": "<obj11>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<obj12>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<obj13>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<obj14>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<obj15>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<obj16>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<obj17>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<obj18>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<obj19>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<obj20>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<obj21>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<obj22>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<obj23>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "151689": {
374
+ "content": "<obj24>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "151690": {
382
+ "content": "<obj25>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "151691": {
390
+ "content": "<obj26>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "151692": {
398
+ "content": "<obj27>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "151693": {
406
+ "content": "<obj28>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "151694": {
414
+ "content": "<obj29>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "151695": {
422
+ "content": "<obj30>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "151696": {
430
+ "content": "<obj31>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "151697": {
438
+ "content": "<obj32>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "151698": {
446
+ "content": "<obj33>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "151699": {
454
+ "content": "<obj34>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "151700": {
462
+ "content": "<obj35>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "151701": {
470
+ "content": "<obj36>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "151702": {
478
+ "content": "<obj37>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "151703": {
486
+ "content": "<obj38>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "151704": {
494
+ "content": "<obj39>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "151705": {
502
+ "content": "<obj40>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "151706": {
510
+ "content": "<obj41>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "151707": {
518
+ "content": "<obj42>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "151708": {
526
+ "content": "<obj43>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "151709": {
534
+ "content": "<obj44>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "151710": {
542
+ "content": "<obj45>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "151711": {
550
+ "content": "<obj46>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "151712": {
558
+ "content": "<obj47>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "151713": {
566
+ "content": "<obj48>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "151714": {
574
+ "content": "<obj49>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "151715": {
582
+ "content": "<obj50>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "151716": {
590
+ "content": "<obj51>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "151717": {
598
+ "content": "<obj52>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "151718": {
606
+ "content": "<obj53>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "151719": {
614
+ "content": "<obj54>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "151720": {
622
+ "content": "<obj55>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "151721": {
630
+ "content": "<obj56>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "151722": {
638
+ "content": "<obj57>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "151723": {
646
+ "content": "<obj58>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "151724": {
654
+ "content": "<obj59>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "151725": {
662
+ "content": "<obj60>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "151726": {
670
+ "content": "<obj61>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "151727": {
678
+ "content": "<obj62>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "151728": {
686
+ "content": "<obj63>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "151729": {
694
+ "content": "<obj64>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "151730": {
702
+ "content": "<obj65>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "151731": {
710
+ "content": "<obj66>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "151732": {
718
+ "content": "<obj67>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "151733": {
726
+ "content": "<obj68>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "151734": {
734
+ "content": "<obj69>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "151735": {
742
+ "content": "<obj70>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "151736": {
750
+ "content": "<obj71>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "151737": {
758
+ "content": "<obj72>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "151738": {
766
+ "content": "<obj73>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "151739": {
774
+ "content": "<obj74>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ },
781
+ "151740": {
782
+ "content": "<obj75>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": true
788
+ },
789
+ "151741": {
790
+ "content": "<obj76>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": true
796
+ },
797
+ "151742": {
798
+ "content": "<obj77>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": true
804
+ },
805
+ "151743": {
806
+ "content": "<obj78>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": true
812
+ },
813
+ "151744": {
814
+ "content": "<obj79>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": true
820
+ },
821
+ "151745": {
822
+ "content": "<obj80>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": true
828
+ },
829
+ "151746": {
830
+ "content": "<obj81>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": true
836
+ },
837
+ "151747": {
838
+ "content": "<obj82>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": true
844
+ },
845
+ "151748": {
846
+ "content": "<obj83>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": true
852
+ },
853
+ "151749": {
854
+ "content": "<obj84>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "151750": {
862
+ "content": "<obj85>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "151751": {
870
+ "content": "<obj86>",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": true
876
+ },
877
+ "151752": {
878
+ "content": "<obj87>",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": true
884
+ },
885
+ "151753": {
886
+ "content": "<obj88>",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": true
892
+ },
893
+ "151754": {
894
+ "content": "<obj89>",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": true
900
+ },
901
+ "151755": {
902
+ "content": "<obj90>",
903
+ "lstrip": false,
904
+ "normalized": false,
905
+ "rstrip": false,
906
+ "single_word": false,
907
+ "special": true
908
+ },
909
+ "151756": {
910
+ "content": "<obj91>",
911
+ "lstrip": false,
912
+ "normalized": false,
913
+ "rstrip": false,
914
+ "single_word": false,
915
+ "special": true
916
+ },
917
+ "151757": {
918
+ "content": "<obj92>",
919
+ "lstrip": false,
920
+ "normalized": false,
921
+ "rstrip": false,
922
+ "single_word": false,
923
+ "special": true
924
+ },
925
+ "151758": {
926
+ "content": "<obj93>",
927
+ "lstrip": false,
928
+ "normalized": false,
929
+ "rstrip": false,
930
+ "single_word": false,
931
+ "special": true
932
+ },
933
+ "151759": {
934
+ "content": "<obj94>",
935
+ "lstrip": false,
936
+ "normalized": false,
937
+ "rstrip": false,
938
+ "single_word": false,
939
+ "special": true
940
+ },
941
+ "151760": {
942
+ "content": "<obj95>",
943
+ "lstrip": false,
944
+ "normalized": false,
945
+ "rstrip": false,
946
+ "single_word": false,
947
+ "special": true
948
+ },
949
+ "151761": {
950
+ "content": "<obj96>",
951
+ "lstrip": false,
952
+ "normalized": false,
953
+ "rstrip": false,
954
+ "single_word": false,
955
+ "special": true
956
+ },
957
+ "151762": {
958
+ "content": "<obj97>",
959
+ "lstrip": false,
960
+ "normalized": false,
961
+ "rstrip": false,
962
+ "single_word": false,
963
+ "special": true
964
+ },
965
+ "151763": {
966
+ "content": "<obj98>",
967
+ "lstrip": false,
968
+ "normalized": false,
969
+ "rstrip": false,
970
+ "single_word": false,
971
+ "special": true
972
+ },
973
+ "151764": {
974
+ "content": "<obj99>",
975
+ "lstrip": false,
976
+ "normalized": false,
977
+ "rstrip": false,
978
+ "single_word": false,
979
+ "special": true
980
+ },
981
+ "151765": {
982
+ "content": "<ground>",
983
+ "lstrip": false,
984
+ "normalized": false,
985
+ "rstrip": false,
986
+ "single_word": false,
987
+ "special": true
988
+ },
989
+ "151766": {
990
+ "content": "</ground>",
991
+ "lstrip": false,
992
+ "normalized": false,
993
+ "rstrip": false,
994
+ "single_word": false,
995
+ "special": true
996
+ },
997
+ "151767": {
998
+ "content": "<objects>",
999
+ "lstrip": false,
1000
+ "normalized": false,
1001
+ "rstrip": false,
1002
+ "single_word": false,
1003
+ "special": true
1004
+ },
1005
+ "151768": {
1006
+ "content": "</objects>",
1007
+ "lstrip": false,
1008
+ "normalized": false,
1009
+ "rstrip": false,
1010
+ "single_word": false,
1011
+ "special": true
1012
+ }
1013
+ },
1014
+ "additional_special_tokens": [
1015
+ "<|im_start|>",
1016
+ "<|im_end|>",
1017
+ "<|object_ref_start|>",
1018
+ "<|object_ref_end|>",
1019
+ "<|box_start|>",
1020
+ "<|box_end|>",
1021
+ "<|quad_start|>",
1022
+ "<|quad_end|>",
1023
+ "<|vision_start|>",
1024
+ "<|vision_end|>",
1025
+ "<|vision_pad|>",
1026
+ "<|image_pad|>",
1027
+ "<|video_pad|>",
1028
+ "<obj0>",
1029
+ "<obj1>",
1030
+ "<obj2>",
1031
+ "<obj3>",
1032
+ "<obj4>",
1033
+ "<obj5>",
1034
+ "<obj6>",
1035
+ "<obj7>",
1036
+ "<obj8>",
1037
+ "<obj9>",
1038
+ "<obj10>",
1039
+ "<obj11>",
1040
+ "<obj12>",
1041
+ "<obj13>",
1042
+ "<obj14>",
1043
+ "<obj15>",
1044
+ "<obj16>",
1045
+ "<obj17>",
1046
+ "<obj18>",
1047
+ "<obj19>",
1048
+ "<obj20>",
1049
+ "<obj21>",
1050
+ "<obj22>",
1051
+ "<obj23>",
1052
+ "<obj24>",
1053
+ "<obj25>",
1054
+ "<obj26>",
1055
+ "<obj27>",
1056
+ "<obj28>",
1057
+ "<obj29>",
1058
+ "<obj30>",
1059
+ "<obj31>",
1060
+ "<obj32>",
1061
+ "<obj33>",
1062
+ "<obj34>",
1063
+ "<obj35>",
1064
+ "<obj36>",
1065
+ "<obj37>",
1066
+ "<obj38>",
1067
+ "<obj39>",
1068
+ "<obj40>",
1069
+ "<obj41>",
1070
+ "<obj42>",
1071
+ "<obj43>",
1072
+ "<obj44>",
1073
+ "<obj45>",
1074
+ "<obj46>",
1075
+ "<obj47>",
1076
+ "<obj48>",
1077
+ "<obj49>",
1078
+ "<obj50>",
1079
+ "<obj51>",
1080
+ "<obj52>",
1081
+ "<obj53>",
1082
+ "<obj54>",
1083
+ "<obj55>",
1084
+ "<obj56>",
1085
+ "<obj57>",
1086
+ "<obj58>",
1087
+ "<obj59>",
1088
+ "<obj60>",
1089
+ "<obj61>",
1090
+ "<obj62>",
1091
+ "<obj63>",
1092
+ "<obj64>",
1093
+ "<obj65>",
1094
+ "<obj66>",
1095
+ "<obj67>",
1096
+ "<obj68>",
1097
+ "<obj69>",
1098
+ "<obj70>",
1099
+ "<obj71>",
1100
+ "<obj72>",
1101
+ "<obj73>",
1102
+ "<obj74>",
1103
+ "<obj75>",
1104
+ "<obj76>",
1105
+ "<obj77>",
1106
+ "<obj78>",
1107
+ "<obj79>",
1108
+ "<obj80>",
1109
+ "<obj81>",
1110
+ "<obj82>",
1111
+ "<obj83>",
1112
+ "<obj84>",
1113
+ "<obj85>",
1114
+ "<obj86>",
1115
+ "<obj87>",
1116
+ "<obj88>",
1117
+ "<obj89>",
1118
+ "<obj90>",
1119
+ "<obj91>",
1120
+ "<obj92>",
1121
+ "<obj93>",
1122
+ "<obj94>",
1123
+ "<obj95>",
1124
+ "<obj96>",
1125
+ "<obj97>",
1126
+ "<obj98>",
1127
+ "<obj99>",
1128
+ "<ground>",
1129
+ "</ground>",
1130
+ "<objects>",
1131
+ "</objects>"
1132
+ ],
1133
+ "bos_token": null,
1134
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
1135
+ "clean_up_tokenization_spaces": false,
1136
+ "eos_token": "<|im_end|>",
1137
+ "errors": "replace",
1138
+ "extra_special_tokens": {},
1139
+ "model_max_length": 2048,
1140
+ "pad_token": null,
1141
+ "padding_side": "right",
1142
+ "split_special_tokens": false,
1143
+ "tokenizer_class": "Qwen2Tokenizer",
1144
+ "unk_token": null
1145
+ }
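The Qwen2 chat_template stored above renders the same ChatML framing that RexSeekProcessor assembles by hand (the processor's own template simply omits the system turn), with <|im_end|> serving as both EOS and stop word. A short sketch of what the template produces (ours; the checkpoint path is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/checkpoint")  # placeholder path
messages = [{"role": "user", "content": "<image>\n<obj0><objfeat>\nWhat is <obj0>?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>
# <obj0><objfeat>
# What is <obj0>?<|im_end|>
# <|im_start|>assistant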
vocab.json ADDED
The diff for this file is too large to render. See raw diff