diff --git a/eval_mm_niah_ring_attn_256/configuration_intern_vit.py b/eval_mm_niah_ring_attn_256/configuration_intern_vit.py deleted file mode 100644 index ac60112c79abc35627a5b6b58e760c2f78e71839..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/configuration_intern_vit.py +++ /dev/null @@ -1,119 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2024 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- -import os -from typing import Union - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - - -class InternVisionConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to - instantiate a vision encoder according to the specified arguments, defining the model architecture. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - Args: - num_channels (`int`, *optional*, defaults to 3): - Number of color channels in the input images (e.g., 3 for RGB). - patch_size (`int`, *optional*, defaults to 14): - The size (resolution) of each patch. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - qkv_bias (`bool`, *optional*, defaults to `False`): - Whether to add a bias to the queries and values in the self-attention layers. - hidden_size (`int`, *optional*, defaults to 3200): - Dimensionality of the encoder layers and the pooler layer. - num_attention_heads (`int`, *optional*, defaults to 25): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (`int`, *optional*, defaults to 12800): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - qk_normalization (`bool`, *optional*, defaults to `True`): - Whether to normalize the queries and keys in the self-attention layers. - num_hidden_layers (`int`, *optional*, defaults to 48): - Number of hidden layers in the Transformer encoder. - use_flash_attn (`bool`, *optional*, defaults to `True`): - Whether to use flash attention mechanism. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, - `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-6): - The epsilon used by the layer normalization layers. - dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - drop_path_rate (`float`, *optional*, defaults to 0.0): - Dropout rate for stochastic depth. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 0.1): - A factor for layer scale. 
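A minimal construction sketch for this config (the values mirror the defaults documented above; the smaller variant is purely illustrative and not taken from any released checkpoint):

config = InternVisionConfig()                 # all defaults listed above
assert config.hidden_size == 3200 and config.num_attention_heads == 25
# Override a few fields for a hypothetical smaller vision tower.
small = InternVisionConfig(hidden_size=1024, num_attention_heads=16, num_hidden_layers=24)
small.to_json_string()                        # serializes like any PretrainedConfig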
- """ - - model_type = 'intern_vit_6b' - - def __init__( - self, - num_channels=3, - patch_size=14, - image_size=224, - qkv_bias=False, - hidden_size=3200, - num_attention_heads=25, - intermediate_size=12800, - qk_normalization=True, - num_hidden_layers=48, - use_flash_attn=True, - hidden_act='gelu', - norm_type='rms_norm', - layer_norm_eps=1e-6, - dropout=0.0, - drop_path_rate=0.0, - attention_dropout=0.0, - initializer_range=0.02, - initializer_factor=0.1, - **kwargs, - ): - super().__init__(**kwargs) - - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.dropout = dropout - self.drop_path_rate = drop_path_rate - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.initializer_range = initializer_range - self.initializer_factor = initializer_factor - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.norm_type = norm_type - self.qkv_bias = qkv_bias - self.qk_normalization = qk_normalization - self.use_flash_attn = use_flash_attn - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig': - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if 'vision_config' in config_dict: - config_dict = config_dict['vision_config'] - - if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' - ) - - return cls.from_dict(config_dict, **kwargs) diff --git a/eval_mm_niah_ring_attn_256/configuration_internlm2.py b/eval_mm_niah_ring_attn_256/configuration_internlm2.py deleted file mode 100644 index 282b13b1e2066ecc074ecae87b35a19d251f0ed7..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/configuration_internlm2.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on transformers/src/transformers/models/llama/configuration_llama.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" InternLM2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {} - - -# Modified from transformers.model.llama.configuration_llama.LlamaConfig -class InternLM2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate - an InternLM2 model according to the specified arguments, defining the model architecture. 
Instantiating a - configuration with the defaults will yield a similar configuration to that of the InternLM2-7B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`InternLM2Model`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to - `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. Typically set this to something large - just in case (e.g., 512 or 1024 or 2048). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-12): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. 
- tie_word_embeddings(`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - Example: - - """ - model_type = 'internlm2' - _auto_class = 'AutoConfig' - - def __init__( # pylint: disable=W0102 - self, - vocab_size=103168, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act='silu', - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - bias=True, - rope_theta=10000, - rope_scaling=None, - attn_implementation='eager', - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.bias = bias - - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - self.num_key_value_heads = num_key_value_heads - - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() - - self.attn_implementation = attn_implementation - if self.attn_implementation is None: - self.attn_implementation = 'eager' - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - '`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, ' - f'got {self.rope_scaling}' - ) - rope_scaling_type = self.rope_scaling.get('type', None) - rope_scaling_factor = self.rope_scaling.get('factor', None) - if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}") diff --git a/eval_mm_niah_ring_attn_256/configuration_internvl_chat.py b/eval_mm_niah_ring_attn_256/configuration_internvl_chat.py deleted file mode 100644 index b5a518b7883535e2038fcd2d2fdd32f3c14da5ee..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/configuration_internvl_chat.py +++ /dev/null @@ -1,96 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2024 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- - -import copy - -from transformers import AutoConfig, LlamaConfig -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging - -from .configuration_intern_vit import InternVisionConfig -from .configuration_internlm2 import InternLM2Config - -logger = logging.get_logger(__name__) - - -class InternVLChatConfig(PretrainedConfig): - model_type = 'internvl_chat' - is_composition = True - - def __init__( - self, - vision_config=None, - 
llm_config=None, - use_backbone_lora=0, - use_llm_lora=0, - select_layer=-1, - force_image_size=None, - downsample_ratio=0.5, - template=None, - dynamic_image_size=False, - use_thumbnail=False, - ps_version='v1', - min_dynamic_patch=1, - max_dynamic_patch=6, - **kwargs): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - logger.info('vision_config is None. Initializing the InternVisionConfig with default values.') - - if llm_config is None: - llm_config = {} - logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).') - - self.vision_config = InternVisionConfig(**vision_config) - if llm_config['architectures'][0] == 'LlamaForCausalLM': - self.llm_config = LlamaConfig(**llm_config) - elif llm_config['architectures'][0] == 'InternLM2ForCausalLM': - self.llm_config = InternLM2Config(**llm_config) - else: - raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0])) - self.use_backbone_lora = use_backbone_lora - self.use_llm_lora = use_llm_lora - self.select_layer = select_layer - self.force_image_size = force_image_size - self.downsample_ratio = downsample_ratio - self.template = template - self.dynamic_image_size = dynamic_image_size - self.use_thumbnail = use_thumbnail - self.ps_version = ps_version # pixel shuffle version - self.min_dynamic_patch = min_dynamic_patch - self.max_dynamic_patch = max_dynamic_patch - - logger.info(f'vision_select_layer: {self.select_layer}') - logger.info(f'ps_version: {self.ps_version}') - logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}') - logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}') - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. - - Returns: - `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = copy.deepcopy(self.__dict__) - output['vision_config'] = self.vision_config.to_dict() - output['llm_config'] = self.llm_config.to_dict() - output['model_type'] = self.__class__.model_type - output['use_backbone_lora'] = self.use_backbone_lora - output['use_llm_lora'] = self.use_llm_lora - output['select_layer'] = self.select_layer - output['force_image_size'] = self.force_image_size - output['downsample_ratio'] = self.downsample_ratio - output['template'] = self.template - output['dynamic_image_size'] = self.dynamic_image_size - output['use_thumbnail'] = self.use_thumbnail - output['ps_version'] = self.ps_version - output['min_dynamic_patch'] = self.min_dynamic_patch - output['max_dynamic_patch'] = self.max_dynamic_patch - - return output diff --git a/eval_mm_niah_ring_attn_256/conversation.py b/eval_mm_niah_ring_attn_256/conversation.py deleted file mode 100644 index 2fe37ad08c18c49fd5a4d7e0aa9be10fbeead22c..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/conversation.py +++ /dev/null @@ -1,393 +0,0 @@ -""" -Conversation prompt templates. - -We kindly request that you import fastchat instead of copying this file if you wish to use it. -If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates. 
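A minimal usage sketch of the template registry defined later in this file (`get_conv_template`, `append_message`, `roles`, and `get_prompt` all appear below; the question text is an illustrative assumption):

conv = get_conv_template('internlm2-chat')
conv.append_message(conv.roles[0], 'Describe the image in one sentence.')
conv.append_message(conv.roles[1], None)      # leave the assistant turn open for generation
prompt = conv.get_prompt()
# MPT-style layout: '<|im_start|>system\n...<|im_end|><|im_start|>user\n...<|im_end|><|im_start|>assistant\n'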
-""" - -import dataclasses -from enum import IntEnum, auto -from typing import Any, Dict, List, Tuple, Union - - -class SeparatorStyle(IntEnum): - """Separator styles.""" - - ADD_COLON_SINGLE = auto() - ADD_COLON_TWO = auto() - ADD_COLON_SPACE_SINGLE = auto() - NO_COLON_SINGLE = auto() - NO_COLON_TWO = auto() - ADD_NEW_LINE_SINGLE = auto() - LLAMA2 = auto() - CHATGLM = auto() - CHATML = auto() - CHATINTERN = auto() - DOLLY = auto() - RWKV = auto() - PHOENIX = auto() - ROBIN = auto() - FALCON_CHAT = auto() - CHATGLM3 = auto() - INTERNVL_ZH = auto() - MPT = auto() - - -@dataclasses.dataclass -class Conversation: - """A class that manages prompt templates and keeps all conversation history.""" - - # The name of this template - name: str - # The template of the system prompt - system_template: str = '{system_message}' - # The system message - system_message: str = '' - # The names of two roles - roles: Tuple[str] = ('USER', 'ASSISTANT') - # All messages. Each item is (role, message). - messages: List[List[str]] = () - # The number of few shot examples - offset: int = 0 - # The separator style and configurations - sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE - sep: str = '\n' - sep2: str = None - # Stop criteria (the default one is EOS token) - stop_str: Union[str, List[str]] = None - # Stops generation if meeting any token in this list - stop_token_ids: List[int] = None - - def get_prompt(self) -> str: - """Get the prompt for generation.""" - system_prompt = self.system_template.format(system_message=self.system_message) - if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ': ' + message + self.sep - else: - ret += role + ':' - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt + seps[0] - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ': ' + message + seps[i % 2] - else: - ret += role + ':' - return ret - elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ': ' + message + self.sep - else: - ret += role + ': ' # must be end with a space - return ret - elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: - ret = '' if system_prompt == '' else system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + '\n' + message + self.sep - else: - ret += role + '\n' - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + message + self.sep - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.NO_COLON_TWO: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + message + seps[i % 2] - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.RWKV: - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += ( - role - + ': ' - + message.replace('\r\n', '\n').replace('\n\n', '\n') - ) - ret += '\n\n' - else: - ret += role + ':' - return ret - elif self.sep_style == SeparatorStyle.LLAMA2: - seps = [self.sep, self.sep2] - if self.system_message: - ret = system_prompt - else: - ret = '[INST] ' - for i, (role, message) in enumerate(self.messages): - tag = self.roles[i % 2] - if message: - 
if i == 0: - ret += message + ' ' - else: - ret += tag + ' ' + message + seps[i % 2] - else: - ret += tag - return ret - elif self.sep_style == SeparatorStyle.CHATGLM: - # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 - # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 - round_add_n = 1 if self.name == 'chatglm2' else 0 - if system_prompt: - ret = system_prompt + self.sep - else: - ret = '' - - for i, (role, message) in enumerate(self.messages): - if i % 2 == 0: - ret += f'[Round {i//2 + round_add_n}]{self.sep}' - - if message: - ret += f'{role}:{message}{self.sep}' - else: - ret += f'{role}:' - return ret - elif self.sep_style == SeparatorStyle.CHATML: - ret = '' if system_prompt == '' else system_prompt + self.sep + '\n' - for role, message in self.messages: - if message: - ret += role + '\n' + message + self.sep + '\n' - else: - ret += role + '\n' - return ret - elif self.sep_style == SeparatorStyle.CHATGLM3: - ret = '' - if self.system_message: - ret += system_prompt - for role, message in self.messages: - if message: - ret += role + '\n' + ' ' + message - else: - ret += role - return ret - elif self.sep_style == SeparatorStyle.CHATINTERN: - # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - # if i % 2 == 0: - # ret += "" - if message: - ret += role + ':' + message + seps[i % 2] + '\n' - else: - ret += role + ':' - return ret - elif self.sep_style == SeparatorStyle.DOLLY: - seps = [self.sep, self.sep2] - ret = system_prompt - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ':\n' + message + seps[i % 2] - if i % 2 == 1: - ret += '\n\n' - else: - ret += role + ':\n' - return ret - elif self.sep_style == SeparatorStyle.PHOENIX: - ret = system_prompt - for role, message in self.messages: - if message: - ret += role + ': ' + '' + message + '' - else: - ret += role + ': ' + '' - return ret - elif self.sep_style == SeparatorStyle.ROBIN: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ':\n' + message + self.sep - else: - ret += role + ':\n' - return ret - elif self.sep_style == SeparatorStyle.FALCON_CHAT: - ret = '' - if self.system_message: - ret += system_prompt + self.sep - for role, message in self.messages: - if message: - ret += role + ': ' + message + self.sep - else: - ret += role + ':' - - return ret - elif self.sep_style == SeparatorStyle.INTERNVL_ZH: - seps = [self.sep, self.sep2] - ret = self.system_message + seps[0] - for i, (role, message) in enumerate(self.messages): - if message: - ret += role + ': ' + message + seps[i % 2] - else: - ret += role + ':' - return ret - elif self.sep_style == SeparatorStyle.MPT: - ret = system_prompt + self.sep - for role, message in self.messages: - if message: - if type(message) is tuple: - message, _, _ = message - ret += role + message + self.sep - else: - ret += role - return ret - else: - raise ValueError(f'Invalid style: {self.sep_style}') - - def set_system_message(self, system_message: str): - """Set the system message.""" - self.system_message = system_message - - def append_message(self, role: str, message: str): - """Append a new message.""" - self.messages.append([role, message]) - - def 
update_last_message(self, message: str): - """Update the last output. - - The last message is typically set to be None when constructing the prompt, - so we need to update it in-place after getting the response from a model. - """ - self.messages[-1][1] = message - - def to_gradio_chatbot(self): - """Convert the conversation to gradio chatbot format.""" - ret = [] - for i, (role, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - ret.append([msg, None]) - else: - ret[-1][-1] = msg - return ret - - def to_openai_api_messages(self): - """Convert the conversation to OpenAI chat completion format.""" - ret = [{'role': 'system', 'content': self.system_message}] - - for i, (_, msg) in enumerate(self.messages[self.offset :]): - if i % 2 == 0: - ret.append({'role': 'user', 'content': msg}) - else: - if msg is not None: - ret.append({'role': 'assistant', 'content': msg}) - return ret - - def copy(self): - return Conversation( - name=self.name, - system_template=self.system_template, - system_message=self.system_message, - roles=self.roles, - messages=[[x, y] for x, y in self.messages], - offset=self.offset, - sep_style=self.sep_style, - sep=self.sep, - sep2=self.sep2, - stop_str=self.stop_str, - stop_token_ids=self.stop_token_ids, - ) - - def dict(self): - return { - 'template_name': self.name, - 'system_message': self.system_message, - 'roles': self.roles, - 'messages': self.messages, - 'offset': self.offset, - } - - -# A global registry for all conversation templates -conv_templates: Dict[str, Conversation] = {} - - -def register_conv_template(template: Conversation, override: bool = False): - """Register a new conversation template.""" - if not override: - assert ( - template.name not in conv_templates - ), f'{template.name} has been registered.' - - conv_templates[template.name] = template - - -def get_conv_template(name: str) -> Conversation: - """Get a conversation template.""" - return conv_templates[name].copy() - - -# Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference -# is that during training, the preprocessing function for the Hermes-2 template doesn't add -# at the beginning of the tokenized sequence, while the internlm2-chat template does. -# Therefore, they are completely equivalent during inference. -register_conv_template( - Conversation( - name='Hermes-2', - system_template='<|im_start|>system\n{system_message}', - # note: The new system prompt was not used here to avoid changes in benchmark performance. - # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', - system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。', - roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), - sep_style=SeparatorStyle.MPT, - sep='<|im_end|>', - stop_token_ids=[ - 2, - 6, - 7, - 8, - ], - stop_str='<|endoftext|>', - ) -) - - -register_conv_template( - Conversation( - name='internlm2-chat', - system_template='<|im_start|>system\n{system_message}', - # note: The new system prompt was not used here to avoid changes in benchmark performance. 
- # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', - system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。', - roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), - sep_style=SeparatorStyle.MPT, - sep='<|im_end|>', - stop_token_ids=[ - 2, - 92543, - 92542 - ] - ) -) - - -register_conv_template( - Conversation( - name='phi3-chat', - system_template='<|system|>\n{system_message}', - # note: The new system prompt was not used here to avoid changes in benchmark performance. - # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。', - system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。', - roles=('<|user|>\n', '<|assistant|>\n'), - sep_style=SeparatorStyle.MPT, - sep='<|end|>', - stop_token_ids=[ - 2, - 32000, - 32007 - ] - ) -) diff --git a/eval_mm_niah_ring_attn_256/modeling_intern_vit.py b/eval_mm_niah_ring_attn_256/modeling_intern_vit.py deleted file mode 100644 index 588c3de46ce4748444ddce4a1bb72cb8de74996f..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/modeling_intern_vit.py +++ /dev/null @@ -1,429 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2024 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- -from typing import Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from einops import rearrange -from timm.models.layers import DropPath -from torch import nn -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutput, - BaseModelOutputWithPooling) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging - -from .configuration_intern_vit import InternVisionConfig - -try: - from flash_attn.bert_padding import pad_input, unpad_input - from flash_attn.flash_attn_interface import \ - flash_attn_varlen_qkvpacked_func - has_flash_attn = True -except: - print('FlashAttention2 is not installed.') - has_flash_attn = False - -logger = logging.get_logger(__name__) - - -class FlashAttention(nn.Module): - """Implement the scaled dot product attention with softmax. - Arguments - --------- - softmax_scale: The temperature to use for the softmax attention. - (default: 1/sqrt(d_keys) where d_keys is computed at - runtime) - attention_dropout: The dropout rate to apply to the attention - (default: 0.0) - """ - - def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): - super().__init__() - self.softmax_scale = softmax_scale - self.dropout_p = attention_dropout - - def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None, - max_s=None, need_weights=False): - """Implements the multihead softmax attention. - Arguments - --------- - qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None - if unpadded: (nnz, 3, h, d) - key_padding_mask: a bool tensor of shape (B, S) - """ - assert not need_weights - assert qkv.dtype in [torch.float16, torch.bfloat16] - assert qkv.is_cuda - - if cu_seqlens is None: - batch_size = qkv.shape[0] - seqlen = qkv.shape[1] - if key_padding_mask is None: - qkv = rearrange(qkv, 'b s ... 
-> (b s) ...') - max_s = seqlen - cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, - device=qkv.device) - output = flash_attn_varlen_qkvpacked_func( - qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, - softmax_scale=self.softmax_scale, causal=causal - ) - output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) - else: - nheads = qkv.shape[-2] - x = rearrange(qkv, 'b s three h d -> b s (three h d)') - x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask) - x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads) - output_unpad = flash_attn_varlen_qkvpacked_func( - x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, - softmax_scale=self.softmax_scale, causal=causal - ) - output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), - indices, batch_size, seqlen), - 'b s (h d) -> b s h d', h=nheads) - else: - assert max_s is not None - output = flash_attn_varlen_qkvpacked_func( - qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, - softmax_scale=self.softmax_scale, causal=causal - ) - - return output, None - - -class InternRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -try: - from apex.normalization import FusedRMSNorm - - InternRMSNorm = FusedRMSNorm # noqa - - logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm') -except ImportError: - # using the normal InternRMSNorm - pass -except Exception: - logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm') - pass - - -NORM2FN = { - 'rms_norm': InternRMSNorm, - 'layer_norm': nn.LayerNorm, -} - - -class InternVisionEmbeddings(nn.Module): - def __init__(self, config: InternVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.class_embedding = nn.Parameter( - torch.randn(1, 1, self.embed_dim), - ) - - self.patch_embedding = nn.Conv2d( - in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches + 1 - - self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) - - def _get_pos_embed(self, pos_embed, H, W): - target_dtype = pos_embed.dtype - pos_embed = pos_embed.float().reshape( - 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) - pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). 
\ - reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype) - return pos_embed - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height] - batch_size, _, height, width = patch_embeds.shape - patch_embeds = patch_embeds.flatten(2).transpose(1, 2) - class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) - embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - position_embedding = torch.cat([ - self.position_embedding[:, :1, :], - self._get_pos_embed(self.position_embedding[:, 1:, :], height, width) - ], dim=1) - embeddings = embeddings + position_embedding.to(target_dtype) - return embeddings - - -class InternAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: InternVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.use_flash_attn = config.use_flash_attn and has_flash_attn - if config.use_flash_attn and not has_flash_attn: - print('Warning: Flash Attention is not available, use_flash_attn is set to False.') - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:' - f' {self.num_heads}).' - ) - - self.scale = self.head_dim ** -0.5 - self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias) - self.attn_drop = nn.Dropout(config.attention_dropout) - self.proj_drop = nn.Dropout(config.dropout) - - self.qk_normalization = config.qk_normalization - - if self.qk_normalization: - self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps) - self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps) - - if self.use_flash_attn: - self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout) - self.proj = nn.Linear(self.embed_dim, self.embed_dim) - - def _naive_attn(self, x): - B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) - - if self.qk_normalization: - B_, H_, N_, D_ = q.shape - q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) - k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) - - attn = ((q * self.scale) @ k.transpose(-2, -1)) - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) - - x = (attn @ v).transpose(1, 2).reshape(B, N, C) - x = self.proj(x) - x = self.proj_drop(x) - return x - - def _flash_attn(self, x, key_padding_mask=None, need_weights=False): - qkv = self.qkv(x) - qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads) - - if self.qk_normalization: - q, k, v = qkv.unbind(2) - q = self.q_norm(q.flatten(-2, -1)).view(q.shape) - k = self.k_norm(k.flatten(-2, -1)).view(k.shape) - qkv = torch.stack([q, k, v], dim=2) - - context, _ = self.inner_attn( - qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False - ) - outs = self.proj(rearrange(context, 'b s h d -> b s (h d)')) - outs = self.proj_drop(outs) - return outs - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - x = self._naive_attn(hidden_states) if not 
self.use_flash_attn else self._flash_attn(hidden_states) - return x - - -class InternMLP(nn.Module): - def __init__(self, config: InternVisionConfig): - super().__init__() - self.config = config - self.act = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -class InternVisionEncoderLayer(nn.Module): - def __init__(self, config: InternVisionConfig, drop_path_rate: float): - super().__init__() - self.embed_dim = config.hidden_size - self.intermediate_size = config.intermediate_size - self.norm_type = config.norm_type - - self.attn = InternAttention(config) - self.mlp = InternMLP(config) - self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) - self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) - - self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim)) - self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim)) - self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() - self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() - - def forward( - self, - hidden_states: torch.Tensor, - ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]: - """ - Args: - hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)` - """ - hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1) - - hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2) - - return hidden_states - - -class InternVisionEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`InternEncoderLayer`]. - - Args: - config (`InternConfig`): - The corresponding vision configuration for the `InternEncoder`. - """ - - def __init__(self, config: InternVisionConfig): - super().__init__() - self.config = config - # stochastic depth decay rule - dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)] - self.layers = nn.ModuleList([ - InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)]) - self.gradient_checkpointing = True - - def forward( - self, - inputs_embeds, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Embedded representation of the inputs. Should be float, not int tokens. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
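A forward-pass sketch for the vision tower built from this encoder (`InternVisionModel` is defined further down in this file; the scaled-down sizes are assumptions so the sketch stays cheap, and the eager attention path is forced so it also runs on CPU):

import torch

cfg = InternVisionConfig(hidden_size=512, num_attention_heads=8,
                         intermediate_size=2048, num_hidden_layers=2,
                         use_flash_attn=False)   # force the eager path for this sketch
model = InternVisionModel(cfg).eval()
pixels = torch.randn(1, 3, cfg.image_size, cfg.image_size)
with torch.no_grad():
    out = model(pixel_values=pixels, return_dict=True)
# 1 CLS token + (224 // 14) ** 2 = 257 patch tokens, each of width hidden_size
assert out.last_hidden_state.shape == (1, 257, cfg.hidden_size)
assert out.pooler_output.shape == (1, cfg.hidden_size)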
- """ - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - hidden_states = inputs_embeds - - for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = torch.utils.checkpoint.checkpoint( - encoder_layer, - hidden_states) - else: - layer_outputs = encoder_layer( - hidden_states, - ) - hidden_states = layer_outputs - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states - ) - - -class InternVisionModel(PreTrainedModel): - main_input_name = 'pixel_values' - _supports_flash_attn_2 = True - config_class = InternVisionConfig - _no_split_modules = ['InternVisionEncoderLayer'] - - def __init__(self, config: InternVisionConfig): - super().__init__(config) - self.config = config - - self.embeddings = InternVisionEmbeddings(config) - self.encoder = InternVisionEncoder(config) - - def resize_pos_embeddings(self, old_size, new_size, patch_size): - pos_emb = self.embeddings.position_embedding - _, num_positions, embed_dim = pos_emb.shape - cls_emb = pos_emb[:, :1, :] - pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2) - pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False) - pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1) - pos_emb = torch.cat([cls_emb, pos_emb], dim=1) - self.embeddings.position_embedding = nn.Parameter(pos_emb) - self.embeddings.image_size = new_size - logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size)) - - def get_input_embeddings(self): - return self.embeddings - - def forward( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - pixel_embeds: Optional[torch.FloatTensor] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None and pixel_embeds is None: - raise ValueError('You have to specify pixel_values or pixel_embeds') - - if pixel_embeds is not None: - hidden_states = pixel_embeds - else: - if len(pixel_values.shape) == 4: - hidden_states = self.embeddings(pixel_values) - else: - raise ValueError(f'wrong pixel_values size: {pixel_values.shape}') - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - last_hidden_state = encoder_outputs.last_hidden_state - pooled_output = last_hidden_state[:, 0, :] - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) diff --git 
a/eval_mm_niah_ring_attn_256/modeling_internlm2.py b/eval_mm_niah_ring_attn_256/modeling_internlm2.py deleted file mode 100644 index 7c8c24d873f6ecd152d00fd65371e23ead981e1d..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/modeling_internlm2.py +++ /dev/null @@ -1,1415 +0,0 @@ -# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on transformers/src/transformers/models/llama/modeling_llama.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch InternLM2 model.""" -import math -import queue -import threading -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from einops import rearrange -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers.activations import ACT2FN -from transformers.modeling_outputs import (BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import (add_start_docstrings, - add_start_docstrings_to_model_forward, logging, - replace_return_docstrings) - -try: - from transformers.generation.streamers import BaseStreamer -except: # noqa # pylint: disable=bare-except - BaseStreamer = None - -from .configuration_internlm2 import InternLM2Config - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = 'InternLM2Config' - -flash_attn_func, flash_attn_varlen_func = None, None -pad_input, index_first_axis, unpad_input = None, None, None -try: - from flash_attn import flash_attn_func as _flash_attn_func - from flash_attn import flash_attn_varlen_func as _flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis as _index_first_axis - from flash_attn.bert_padding import pad_input as _pad_input - from flash_attn.bert_padding import unpad_input as _unpad_input - - flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func - pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input - has_flash_attn = True -except: - has_flash_attn = False - - -def _import_flash_attn(): - global flash_attn_func, flash_attn_varlen_func - global pad_input, index_first_axis, unpad_input - try: - from flash_attn import flash_attn_func as _flash_attn_func - from flash_attn import \ - flash_attn_varlen_func as _flash_attn_varlen_func - from flash_attn.bert_padding import \ - index_first_axis as _index_first_axis - from flash_attn.bert_padding import pad_input as _pad_input - from flash_attn.bert_padding import unpad_input as _unpad_input - flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func - pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input - except ImportError: - raise ImportError('flash_attn is not installed.') - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def 
_get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.bart.modeling_bart._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - """ - Make causal mask used for bi-directional self-attention. - """ - bsz, tgt_len = input_ids_shape - mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) - mask_cond = torch.arange(mask.size(-1), device=device) - mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) - mask = mask.to(dtype) - - if past_key_values_length > 0: - mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) - return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2 -class InternLM2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - InternLM2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2 -class InternLM2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False) - self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2 -class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False) - self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False) - - -# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2 -class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): - """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. - Credits to the Reddit users /u/bloc97 and /u/emozilla. 
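A numeric sketch of the base rescaling applied in `_set_cos_sin_cache` below (the concrete sizes are illustrative assumptions, not taken from any checkpoint):

base, dim, max_pos, factor, seq_len = 10000.0, 128, 2048, 2.0, 8192
new_base = base * ((factor * seq_len / max_pos) - (factor - 1)) ** (dim / (dim - 2))
# new_base ≈ 7.2e4: beyond the trained context, the rotary base grows with sequence
# length, stretching the low-frequency components instead of rescaling position ids.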
- """ - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer('inv_freq', inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype) - - freqs = torch.einsum('i,j->ij', t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False) - self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False) - - -# Copied from transformers.model.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors.""" - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class InternLM2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) - - return down_proj - - -# Copied from transformers.model.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -# Modified from transformers.model.llama.modeling_llama.LlamaAttention -class InternLM2Attention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: InternLM2Config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}' - f' and `num_heads`: {self.num_heads}).' - ) - - self.wqkv = nn.Linear( - self.hidden_size, - (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, - bias=config.bias, - ) - - self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = InternLM2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.config.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling['type'] - scaling_factor = self.config.rope_scaling['factor'] - if scaling_type == 'dynamic': - self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.config.rope_theta, - scaling_factor=scaling_factor, - ) - elif scaling_type == 'linear': - self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.config.rope_theta, - scaling_factor=scaling_factor, - ) - else: - raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.") - return self.rotary_emb - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if 'padding_mask' in kwargs: - warnings.warn( - 'Passing `padding_mask` is deprecated and will be removed in v4.37. 
' - 'Please make sure use `attention_mask` instead.`' - ) - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., : self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is' - f' {attn_weights.size()}' - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}' - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' - f' {attn_output.size()}' - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -# Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2 -class InternLM2FlashAttention2(InternLM2Attention): - """ - InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. 
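As a rough illustration of that padding handling (a minimal pure-PyTorch sketch mirroring the `_get_unpad_data` helper relied on below; the mask values are made up), the 2-D attention mask is reduced to flattened token indices plus cumulative sequence lengths before the varlen kernel is called:

```python
import torch
import torch.nn.functional as F

# Example padding mask: batch of 2, max length 5; 1 marks real tokens, 0 marks padding.
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]], dtype=torch.int32)

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)             # tensor([3, 5])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # positions of real tokens
max_seqlen_in_batch = int(seqlens_in_batch.max())                            # 5
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

print(indices)     # tensor([0, 1, 2, 5, 6, 7, 8, 9])
print(cu_seqlens)  # tensor([0, 3, 8], dtype=torch.int32)
```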
- """ - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # InternLM2FlashAttention2 attention does not support output_attentions - if 'padding_mask' in kwargs: - warnings.warn( - 'Passing `padding_mask` is deprecated and will be removed in v4.37. ' - 'Please make sure use `attention_mask` instead.`' - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop('padding_mask') - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - qkv_states = self.wqkv(hidden_states) - - qkv_states = rearrange( - qkv_states, - 'b q (h gs d) -> b q h gs d', - gs=2 + self.num_key_value_groups, - d=self.head_dim, - ) - - query_states = qkv_states[..., : self.num_key_value_groups, :] - query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') - key_states = qkv_states[..., -2, :] - value_states = qkv_states[..., -1, :] - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len - ) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.wo(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) - """ - # Contains at least one padding token in the sequence - causal = self.is_causal and query_length != 1 - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - attn_output = flash_attn_func( - query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal - ) - - return attn_output - - def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q.to(torch.int64), - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -INTERNLM2_ATTENTION_CLASSES = { - 'eager': InternLM2Attention, - 'flash_attention_2': InternLM2FlashAttention2, -} - - -# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer -class InternLM2DecoderLayer(nn.Module): - def __init__(self, config: InternLM2Config): - super().__init__() - self.hidden_size = config.hidden_size - - self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config) - - self.feed_forward = InternLM2MLP(config) - self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - if 'padding_mask' in kwargs: - warnings.warn( - 'Passing `padding_mask` is deprecated and will be removed in v4.37. ' - 'Please make sure use `attention_mask` instead.`' - ) - - residual = hidden_states - - hidden_states = self.attention_norm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.attention( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.ffn_norm(hidden_states) - hidden_states = self.feed_forward(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -InternLM2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`InternLM2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2PreTrainedModel(PreTrainedModel): - config_class = InternLM2Config - base_model_prefix = 'model' - supports_gradient_checkpointing = True - _no_split_modules = ['InternLM2DecoderLayer'] - _skip_keys_device_placement = 'past_key_values' - _supports_flash_attn_2 = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -InternLM2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or - when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
- - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -# Modified from transformers.model.llama.modeling_llama.LlamaModel -@add_start_docstrings( - 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', - InternLM2_START_DOCSTRING, -) -class InternLM2Model(InternLM2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLM2DecoderLayer`] - - Args: - config: InternLM2Config - """ - - _auto_class = 'AutoModel' - - def __init__(self, config: InternLM2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - self.config = config - if not has_flash_attn: - self.config.attn_implementation = 'eager' - print('Warning: Flash attention is not available, using eager attention instead.') - - self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - - self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.tok_embeddings - - def set_input_embeddings(self, value): - self.tok_embeddings = value - - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.attn_implementation == 'flash_attention_2': - _import_flash_attn() - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time') - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError('You have to specify either input_ids or inputs_embeds') - - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length - - if position_ids is None: - device = 
input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.tok_embeddings(input_ids) - - if self.config.attn_implementation == 'flash_attention_2': - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - else: - if attention_mask is None: - attention_mask = torch.ones( - (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device - ) - attention_mask = self._prepare_decoder_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - # embed positions - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - - def create_custom_forward(module): - def custom_forward(*inputs): - # None for past_key_value - return module(*inputs, output_attentions, None) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(decoder_layer), - hidden_states, - attention_mask, - position_ids, - None, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM -class InternLM2ForCausalLM(InternLM2PreTrainedModel): - _auto_class = 'AutoModelForCausalLM' - - _tied_weights_keys = ['output.weight'] - - def __init__(self, config): - super().__init__(config) - self.model = InternLM2Model(config) - self.vocab_size = config.vocab_size - self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - def get_output_embeddings(self): - return self.output - - def set_output_embeddings(self, new_embeddings): - 
self.output = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, InternLM2ForCausalLM - - >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
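    >>> # Illustrative variation (toy settings, not from the original snippet): sampled decoding
    >>> # with a cap on new tokens, mirroring the sampling defaults used by `chat()` further below.
    >>> generate_ids = model.generate(inputs.input_ids, max_new_tokens=32, do_sample=True, temperature=0.8, top_p=0.8)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True)[0]  # output varies with sampling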
- ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.output(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - device = input_ids.device if input_ids is not None else inputs_embeds.device - output = CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - output['logits'] = output['logits'].to(device) - return output - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {'inputs_embeds': inputs_embeds} - else: - model_inputs = {'input_ids': input_ids} - - model_inputs.update( - { - 'position_ids': position_ids, - 'past_key_values': past_key_values, - 'use_cache': kwargs.get('use_cache'), - 'attention_mask': attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=''): - if tokenizer.add_bos_token: - prompt = '' - else: - prompt = tokenizer.bos_token - if meta_instruction: - prompt += 
f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" - for record in history: - prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" - prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" - return tokenizer([prompt], return_tensors='pt') - - @torch.no_grad() - def chat( - self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - streamer: Optional[BaseStreamer] = None, - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - meta_instruction: str = 'You are an AI assistant whose name is InternLM (书生·浦语).\n' - '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n' - '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.', - **kwargs, - ): - inputs = self.build_inputs(tokenizer, query, history, meta_instruction) - inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} - # also add end-of-assistant token in eos token id to avoid unnecessary generation - eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(['<|im_end|>'])[0]] - outputs = self.generate( - **inputs, - streamer=streamer, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - eos_token_id=eos_token_id, - **kwargs, - ) - outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]) :] - response = tokenizer.decode(outputs, skip_special_tokens=True) - response = response.split('<|im_end|>')[0] - history = history + [(query, response)] - return response, history - - @torch.no_grad() - def stream_chat( - self, - tokenizer, - query: str, - history: List[Tuple[str, str]] = [], - max_new_tokens: int = 1024, - do_sample: bool = True, - temperature: float = 0.8, - top_p: float = 0.8, - **kwargs, - ): - """ - Return a generator in format: (response, history) - Eg. - ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) - ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) - """ - if BaseStreamer is None: - raise ModuleNotFoundError( - 'The version of `transformers` is too low. Please make sure ' - 'that you have installed `transformers>=4.28.0`.' 
- ) - - response_queue = queue.Queue(maxsize=20) - - class ChatStreamer(BaseStreamer): - def __init__(self, tokenizer) -> None: - super().__init__() - self.tokenizer = tokenizer - self.queue = response_queue - self.query = query - self.history = history - self.response = '' - self.cache = [] - self.received_inputs = False - self.queue.put((self.response, history + [(self.query, self.response)])) - - def put(self, value): - if len(value.shape) > 1 and value.shape[0] > 1: - raise ValueError('ChatStreamer only supports batch size 1') - elif len(value.shape) > 1: - value = value[0] - - if not self.received_inputs: - # The first received value is input_ids, ignore here - self.received_inputs = True - return - - self.cache.extend(value.tolist()) - token = self.tokenizer.decode(self.cache, skip_special_tokens=True) - if token.strip() != '<|im_end|>': - self.response = self.response + token - history = self.history + [(self.query, self.response)] - self.queue.put((self.response, history)) - self.cache = [] - else: - self.end() - - def end(self): - self.queue.put(None) - - def stream_producer(): - return self.chat( - tokenizer=tokenizer, - query=query, - streamer=ChatStreamer(tokenizer=tokenizer), - history=history, - max_new_tokens=max_new_tokens, - do_sample=do_sample, - temperature=temperature, - top_p=top_p, - **kwargs, - ) - - def consumer(): - producer = threading.Thread(target=stream_producer) - producer.start() - while True: - res = response_queue.get() - if res is None: - return - yield res - - return consumer() - - -# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 -@add_start_docstrings( - """ - The InternLM2 Model transformer with a sequence classification head on top (linear layer). - - [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, - as other causal models (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
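A minimal sketch of that last-token selection (the `pad_token_id` value and tensor shapes are illustrative), matching the pooling logic in the forward pass that follows:

```python
import torch

pad_token_id = 0  # illustrative; the real model reads config.pad_token_id
input_ids = torch.tensor([[11, 12, 13, 0, 0],
                          [21, 22, 23, 24, 25]])

# First padding position minus one gives the last real token; rows with no padding
# yield -1, i.e. the final position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
print(sequence_lengths)  # tensor([ 2, -1])

logits = torch.randn(2, 5, 3)  # (batch_size, seq_len, num_labels)
pooled_logits = logits[torch.arange(2), sequence_lengths]
print(pooled_logits.shape)  # torch.Size([2, 3])
```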
- """, - InternLM2_START_DOCSTRING, -) -class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = InternLM2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.tok_embeddings - - def set_input_embeddings(self, value): - self.model.tok_embeddings = value - - @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError('Cannot handle batch sizes > 1 if no padding token is defined.') - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( - logits.device - ) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = 'regression' - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = 'single_label_classification' - else: - self.config.problem_type = 'multi_label_classification' - - if self.config.problem_type == 'regression': - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == 'single_label_classification': - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 'multi_label_classification': - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = 
(pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/eval_mm_niah_ring_attn_256/modeling_internvl_chat.py b/eval_mm_niah_ring_attn_256/modeling_internvl_chat.py deleted file mode 100644 index 47e91bc30c3082419899e19f09afa9ad40e275e7..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/modeling_internvl_chat.py +++ /dev/null @@ -1,350 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2024 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- -import warnings -from typing import Any, List, Optional, Tuple, Union - -import torch.utils.checkpoint -import transformers -from torch import nn -from torch.nn import CrossEntropyLoss -from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM, - LlamaTokenizer) -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ModelOutput, logging - -from .configuration_internvl_chat import InternVLChatConfig -from .conversation import get_conv_template -from .modeling_intern_vit import InternVisionModel, has_flash_attn -from .modeling_internlm2 import InternLM2ForCausalLM - -logger = logging.get_logger(__name__) - - -def version_cmp(v1, v2, op='eq'): - import operator - - from packaging import version - op_func = getattr(operator, op) - return op_func(version.parse(v1), version.parse(v2)) - - -class InternVLChatModel(PreTrainedModel): - config_class = InternVLChatConfig - main_input_name = 'pixel_values' - base_model_prefix = 'language_model' - _supports_flash_attn_2 = True - _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer'] - - def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True): - super().__init__(config) - - assert version_cmp(transformers.__version__, '4.36.2', 'ge') - image_size = config.force_image_size or config.vision_config.image_size - patch_size = config.vision_config.patch_size - self.patch_size = patch_size - self.select_layer = config.select_layer - self.template = config.template - self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2)) - self.downsample_ratio = config.downsample_ratio - self.ps_version = config.ps_version - use_flash_attn = use_flash_attn if has_flash_attn else False - config.vision_config.use_flash_attn = True if use_flash_attn else False - config.llm_config.attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager' - - logger.info(f'num_image_token: {self.num_image_token}') - logger.info(f'ps_version: {self.ps_version}') - if vision_model is not None: - self.vision_model = vision_model - else: - self.vision_model = InternVisionModel(config.vision_config) - if language_model is not None: - self.language_model = language_model - else: - if config.llm_config.architectures[0] == 'LlamaForCausalLM': - self.language_model = LlamaForCausalLM(config.llm_config) - elif config.llm_config.architectures[0] == 'InternLM2ForCausalLM': - self.language_model = InternLM2ForCausalLM(config.llm_config) - else: - raise 
NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.') - - vit_hidden_size = config.vision_config.hidden_size - llm_hidden_size = config.llm_config.hidden_size - - self.mlp1 = nn.Sequential( - nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2), - nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size), - nn.GELU(), - nn.Linear(llm_hidden_size, llm_hidden_size) - ) - - self.img_context_token_id = None - self.conv_template = get_conv_template(self.template) - self.system_message = self.conv_template.system_message - - def forward( - self, - pixel_values: torch.FloatTensor, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - image_flags: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - image_flags = image_flags.squeeze(-1) - input_embeds = self.language_model.get_input_embeddings()(input_ids).clone() - - vit_embeds = self.extract_feature(pixel_values) - vit_embeds = vit_embeds[image_flags == 1] - vit_batch_size = pixel_values.shape[0] - - B, N, C = input_embeds.shape - input_embeds = input_embeds.reshape(B * N, C) - - if torch.distributed.get_rank() == 0: - print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}') - - input_ids = input_ids.reshape(B * N) - selected = (input_ids == self.img_context_token_id) - try: - input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C) - except Exception as e: - vit_embeds = vit_embeds.reshape(-1, C) - print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, ' - f'vit_embeds.shape={vit_embeds.shape}') - n_token = selected.sum() - input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token] - - input_embeds = input_embeds.reshape(B, N, C) - - outputs = self.language_model( - inputs_embeds=input_embeds, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - logits = outputs.logits - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def pixel_shuffle(self, x, scale_factor=0.5): - n, w, h, c = x.size() - # N, W, H, C --> N, W, H * scale, C // scale - x = x.view(n, w, int(h * scale_factor), int(c / 
scale_factor)) - # N, W, H * scale, C // scale --> N, H * scale, W, C // scale - x = x.permute(0, 2, 1, 3).contiguous() - # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2) - x = x.view(n, int(h * scale_factor), int(w * scale_factor), - int(c / (scale_factor * scale_factor))) - if self.ps_version == 'v1': - warnings.warn("In ps_version 'v1', the height and width have not been swapped back, " - 'which results in a transposed image.') - else: - x = x.permute(0, 2, 1, 3).contiguous() - return x - - def extract_feature(self, pixel_values): - if self.select_layer == -1: - vit_embeds = self.vision_model( - pixel_values=pixel_values, - output_hidden_states=False, - return_dict=True).last_hidden_state - else: - vit_embeds = self.vision_model( - pixel_values=pixel_values, - output_hidden_states=True, - return_dict=True).hidden_states[self.select_layer] - vit_embeds = vit_embeds[:, 1:, :] - - h = w = int(vit_embeds.shape[1] ** 0.5) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) - vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) - vit_embeds = self.mlp1(vit_embeds) - return vit_embeds - - def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None, - history=None, return_history=False, IMG_START_TOKEN='', IMG_END_TOKEN='', - IMG_CONTEXT_TOKEN='', verbose=False, image_counts=None): - if history is not None or return_history: - print('Now multi-turn chat is not supported in batch_chat.') - raise NotImplementedError - - if image_counts is not None: - num_patches_list = image_counts - print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.') - - img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN) - self.img_context_token_id = img_context_token_id - - if verbose and pixel_values is not None: - image_bs = pixel_values.shape[0] - print(f'dynamic ViT batch size: {image_bs}') - - queries = [] - for idx, num_patches in enumerate(num_patches_list): - question = questions[idx] - if pixel_values is not None and '' not in question: - question = '\n' + question - template = get_conv_template(self.template) - template.system_message = self.system_message - template.append_message(template.roles[0], question) - template.append_message(template.roles[1], None) - query = template.get_prompt() - - image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN - query = query.replace('', image_tokens, 1) - queries.append(query) - - tokenizer.padding_side = 'left' - model_inputs = tokenizer(queries, return_tensors='pt', padding=True) - input_ids = model_inputs['input_ids'].to(self.device) - attention_mask = model_inputs['attention_mask'].to(self.device) - eos_token_id = tokenizer.convert_tokens_to_ids(template.sep) - generation_config['eos_token_id'] = eos_token_id - generation_output = self.generate( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - **generation_config - ) - responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True) - responses = [response.split(template.sep)[0].strip() for response in responses] - return responses - - def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False, - num_patches_list=None, IMG_START_TOKEN='', IMG_END_TOKEN='', IMG_CONTEXT_TOKEN='', - verbose=False): - - if history is None and pixel_values is 
not None and '' not in question: - question = '\n' + question - - if num_patches_list is None: - num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else [] - assert pixel_values is None or len(pixel_values) == sum(num_patches_list) - - img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN) - self.img_context_token_id = img_context_token_id - - template = get_conv_template(self.template) - template.system_message = self.system_message - eos_token_id = tokenizer.convert_tokens_to_ids(template.sep) - - history = [] if history is None else history - for (old_question, old_answer) in history: - template.append_message(template.roles[0], old_question) - template.append_message(template.roles[1], old_answer) - template.append_message(template.roles[0], question) - template.append_message(template.roles[1], None) - query = template.get_prompt() - - if verbose and pixel_values is not None: - image_bs = pixel_values.shape[0] - print(f'dynamic ViT batch size: {image_bs}') - - for num_patches in num_patches_list: - image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN - query = query.replace('', image_tokens, 1) - - model_inputs = tokenizer(query, return_tensors='pt') - input_ids = model_inputs['input_ids'].to(self.device) - attention_mask = model_inputs['attention_mask'].to(self.device) - generation_config['eos_token_id'] = eos_token_id - generation_output = self.generate( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - **generation_config - ) - response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0] - response = response.split(template.sep)[0].strip() - history.append((question, response)) - if return_history: - return response, history - else: - query_to_print = query.replace(IMG_CONTEXT_TOKEN, '') - query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '') - if verbose: - print(query_to_print, response) - return response - - @torch.no_grad() - def generate( - self, - pixel_values: Optional[torch.FloatTensor] = None, - input_ids: Optional[torch.FloatTensor] = None, - attention_mask: Optional[torch.LongTensor] = None, - visual_features: Optional[torch.FloatTensor] = None, - generation_config: Optional[GenerationConfig] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **generate_kwargs, - ) -> torch.LongTensor: - - assert self.img_context_token_id is not None - if pixel_values is not None: - if visual_features is not None: - vit_embeds = visual_features - else: - vit_embeds = self.extract_feature(pixel_values) - input_embeds = self.language_model.get_input_embeddings()(input_ids) - B, N, C = input_embeds.shape - input_embeds = input_embeds.reshape(B * N, C) - - input_ids = input_ids.reshape(B * N) - selected = (input_ids == self.img_context_token_id) - assert selected.sum() != 0 - input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) - - input_embeds = input_embeds.reshape(B, N, C) - else: - input_embeds = self.language_model.get_input_embeddings()(input_ids) - - outputs = self.language_model.generate( - inputs_embeds=input_embeds, - attention_mask=attention_mask, - generation_config=generation_config, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - use_cache=True, - **generate_kwargs, - ) - - return outputs diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-128k_stride_256.log 
b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-128k_stride_256.log deleted file mode 100644 index 6bb39b0c4212f95cce0aaf500ab13a4b837b7edb..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-128k_stride_256.log +++ /dev/null @@ -1,33 +0,0 @@ - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ -SyntaxError: expected ':' - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ -SyntaxError: expected ':' - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ -SyntaxError: expected ':' - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ -SyntaxError: expected ':' -SyntaxError: expected ':' - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ -SyntaxError: expected ':' - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ -SyntaxError: expected ':' - File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184 - if args.interp=='linear' - ^ -SyntaxError: expected ':' -srun: error: HOST-10-140-60-9: tasks 0-7: Exited with exit code 1 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/0_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/0_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index 5b394ee5218481e38c238ac571c303bf27b396c7..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/0_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl +++ /dev/null @@ -1,11 +0,0 @@ -{"question_id": 36, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 0, "response": "The", "context_length": 822747, "placed_depth": [0.44], "correct": false} -{"question_id": 120, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 3, "response": "I", "context_length": 862329, "placed_depth": [0.54], "correct": false} -{"question_id": 78, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 3, "response": "I", "context_length": 880451, "placed_depth": [0.89], "correct": false} -{"question_id": 102, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. 
\nAnswer with the option's letter from the given choices directly.", "answer": 2, "response": "I", "context_length": 886083, "placed_depth": [0.53], "correct": false} -{"question_id": 156, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 2, "response": "The", "context_length": 886099, "placed_depth": [0.24], "correct": false} -{"question_id": 6, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 0, "response": "The", "context_length": 904642, "placed_depth": [0.68], "correct": false} -{"question_id": 186, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 0, "response": "The", "context_length": 922533, "placed_depth": [0.81], "correct": false} -{"question_id": 180, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 2, "response": "The", "context_length": 926680, "placed_depth": [0.85], "correct": false} -{"question_id": 90, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 2, "response": "The", "context_length": 928807, "placed_depth": [0.25], "correct": false} -{"question_id": 24, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 1, "response": "And", "context_length": 931233, "placed_depth": [0.68], "correct": false} -{"question_id": 96, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. 
\nAnswer with the option's letter from the given choices directly.", "answer": 2, "response": "None", "context_length": 935097, "placed_depth": [0.46], "correct": false} diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/10_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/10_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/11_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/11_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/12_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/12_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/13_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/13_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/14_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/14_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/15_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/15_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/16_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/16_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/17_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/17_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/18_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/18_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/19_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/19_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/1_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/1_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/20_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/20_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/21_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/21_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/22_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/22_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/23_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl 
b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/23_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/24_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/24_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/25_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/25_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/26_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/26_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/27_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/27_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/28_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/28_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/29_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/29_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/2_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/2_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/30_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/30_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/31_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/31_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/3_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/3_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/4_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/4_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/5_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/5_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/6_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/6_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/7_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/7_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/8_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl 
b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/8_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/9_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M/temp_InternVL2-2B_retrieval-image-test-long-1M/9_32_InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M_stride_256.log b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M_stride_256.log deleted file mode 100644 index 6b5590f827082079b4f3094d0e2186de8f1e8696..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M_stride_256.log +++ /dev/null @@ -1,540 +0,0 @@ -[2024-11-14 12:34:56,746] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:56,747] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:56,747] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:56,747] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:56,747] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:56,747] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:56,747] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:56,747] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,556] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) 
-[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:57,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2024-11-14 12:34:59,796] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) 
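The retrieval-image-test-long-128k run logged above never started: all eight srun tasks aborted with the same SyntaxError at eval_mm_niah_long.py line 184, where `if args.interp=='linear'` is missing its trailing colon. A minimal sketch of the corrected statement follows; only the missing ':' comes from the interpreter message, and the branch bodies (including the helper name) are assumptions for illustration. The Namespace dumps in this 1M log show interp='None', so the branch would presumably not be taken in this run anyway.

    # Hypothetical fix for the SyntaxError reported at eval_mm_niah_long.py:184
    # ("expected ':'").  Branch bodies below are placeholders, not the repo's code.
    if args.interp == 'linear':
        pos_ids = linear_interpolate_pos_ids(pos_ids, args.factor)  # assumed helper
    else:
        pass  # leave position ids unchanged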
-args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -Start evaluation on task retrieval-image-test-long-1M -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', 
args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', 
outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -Start evaluation on task retrieval-image-test-long-1M -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True 
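The per-rank result shards deleted at the top of this change (the *_ring_attn.jsonl files) hold one JSON record per sample, with `answer` as an option index and `response` as the model's raw reply; in the records shown, `correct` is false and the replies ("The", "I", ...) never name an option letter. A small scoring sketch over that schema, using a hypothetical helper name rather than the repo's own scorer, could look like this:

    # Sketch (hypothetical helper): accuracy over result records shaped like the
    # JSONL fragments above, where "answer" is an option index (0-3) and
    # "response" should start with the matching letter A-D.
    import json

    def niah_accuracy(path: str) -> float:
        total = correct = 0
        with open(path) as f:
            for line in f:
                rec = json.loads(line)
                total += 1
                expected = 'ABCD'[rec['answer']]
                if rec['response'].strip().upper().startswith(expected):
                    correct += 1
        return correct / max(total, 1)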
-args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -Start evaluation on task retrieval-image-test-long-1M -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', 
args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 2} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 3} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 6} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 7} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 0} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 4} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 2} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 4} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -device_map={'': 6} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -device_map={'': 1} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -device_map={'': 3} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 5} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -device_map={'': 1} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 3} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 6} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 1} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 7} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. 
It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 4} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 5} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 0} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 2} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 5} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 7} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. 
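The `device_map={'': N}` lines in this log appear to pin one full model replica to a single local GPU per rank (consistent with num_gpus_per_rank=1), and the repeated `trust_remote_code` warning only says the flag was also passed to a non-Auto class, where it is ignored. A minimal loading sketch consistent with those lines, with the path and dtype as assumptions, would be:

    # Sketch: one whole-model replica per local GPU, matching the
    # device_map={'': N} lines above.  The empty-string key maps the entire
    # module tree to a single device.
    import torch
    from transformers import AutoModel, AutoTokenizer

    checkpoint = '/path/to/InternVL2-2B'      # placeholder path
    local_gpu = torch.cuda.current_device()

    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,           # assumed dtype
        trust_remote_code=True,               # honoured here: AutoModel is an Auto class
        device_map={'': local_gpu},           # place every submodule on this rank's GPU
    ).eval()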
-args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, 
ring_attn=True) -args.rope_pos_id_version='v5' -args.ring_attn=True -args=Namespace(checkpoint='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B', task='retrieval-image-test-long-1M', outputs_dir='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B/eval_mm_niah_ring_attn_256/retrieval-image-test-long-1M', num_gpus_per_rank=1, image_folder='', question_file='', rope_pos_id_version='v5', rope_factor=None, rope_pos_id_stride=256, interp='None', factor=1, ring_attn=True) -Start evaluation on task retrieval-image-test-long-1M -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Start evaluation on task retrieval-image-test-long-1M -args.image_folder='', args.question_file='/mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/dataset/niah32k/test_image_retrieval_800_1200_sub200.jsonl' -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 0} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 4} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 6} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 5} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 3} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 7} -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 2} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. 
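The tracebacks near the end of this log show a relaunch dying on `from transformers.integrations.tpu import tpu_spmd_dataloader`, a module path that apparently does not exist in the installed transformers version. One plausible, version-tolerant workaround, with the fallback value as an assumption, is to guard that import in compress_seq_trainer.py:

    # Sketch of a guarded import for the "No module named
    # 'transformers.integrations.tpu'" failure seen later in this log.
    try:
        from transformers.integrations.tpu import tpu_spmd_dataloader
    except ImportError:  # older transformers without the tpu integration module
        tpu_spmd_dataloader = None  # assumed fallback: TPU/SPMD dataloading disabled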
-Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 1} -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Replace INTERNLM2_ATTENTION_CLASSES to support packed training!! -device_map={'': 0} - 0 -The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. -Rank [5] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=5)} -Rank [9] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=1)} -Rank [1] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=1)} -Rank [4] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=4)} -Rank [2] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=2)} -Rank [7] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=7)} -Rank [3] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=3)} -Rank [6] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=6)} -Rank [0] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=0)} -Rank [15] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=7)} -Rank [12] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=4)} -Rank [10] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=2)} -Rank [11] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=3)} -Rank [14] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=6)} -Rank [8] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=0)} -Rank [13] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=5)} -Rank [24] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=0)} -Rank [25] Begin 
to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=1)} -Rank [27] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=3)} -Rank [30] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=6)} -Rank [31] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=7)} -Rank [28] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=4)} -Rank [29] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=5)} -Rank [26] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=2)} -Rank [18] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=2)} -Rank [21] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=5)} -Rank [17] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=1)} -Rank [23] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=7)} -Rank [20] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=4)} -Rank [16] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=0)} -Rank [22] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=6)} -Rank [19] Begin to eval model /mnt/petrelfs/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/InternVL2-2B on task retrieval-image-test-long-1M, devices: {device(type='cuda', index=3)} -USE rope_pos_id_stride=256 -Rank 21 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 18 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 17 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 16 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 23 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 20 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 22 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 19 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 14 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 10 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 9 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 8 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 11 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 15 len(skip_idx)=0 -USE rope_pos_id_stride=256 -Rank 26 
len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 27 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 28 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 25 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 24 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 31 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 29 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 30 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 12 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 13 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 1 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 2 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 4 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 7 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 5 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 6 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 3 len(skip_idx)=0
-USE rope_pos_id_stride=256
-Rank 0 len(skip_idx)=6
-dynamic ViT batch size: 3296, images per sample: 3296.0, dynamic token length: 938944
-/mnt/petrelfs/wangweiyun/miniconda3/envs/internvl_gjq/lib/python3.10/site-packages/torch/autograd/function.py:539: UserWarning: 0NCCL_AVOID_RECORD_STREAMS=1 has no effect for point-to-point collectives. (Triggered internally at ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1849.)
-  return super().apply(*args, **kwargs)  # type: ignore[misc]
[the same UserWarning was emitted by each rank]
- Processing InternVL2-2B_retrieval-image-test-long-1M_ring_attn.jsonl:   0%|          | 0/34 [00:00<?, ?it/s]
-Traceback (most recent call last):
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 11, in <module>
-    from internvl.model.internvl_chat import InternVLChatModel
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/model/internvl_chat/__init__.py", line 10, in <module>
-    from .modeling_internvl_chat import InternVLChatModel
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/model/internvl_chat/modeling_internvl_chat.py", line 13, in <module>
-    from internvl.model.internlm2.modeling_internlm2 import InternLM2ForCausalLM
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/model/internlm2/modeling_internlm2.py", line 39, in <module>
-    from internvl.train.compress_seq_trainer import chunk_with_boundaries
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/train/compress_seq_trainer.py", line 4, in <module>
-    from transformers.integrations.tpu import tpu_spmd_dataloader
-ModuleNotFoundError: No module named 'transformers.integrations.tpu'
[the identical traceback was raised by each of the 16 srun tasks, interleaved in the original log]
-srun: error: HOST-10-140-66-136: tasks 0-5,7: Exited with exit code 1
-srun: error: 
HOST-10-140-66-136: task 6: Exited with exit code 1 -srun: error: HOST-10-140-66-137: tasks 8-9,11-15: Exited with exit code 1 -srun: error: HOST-10-140-66-137: task 10: Exited with exit code 1 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/0_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/0_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index 9336e5ae5b9a867509fe4d63bcdacb1dc9aca157..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/0_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"question_id": 84, "question": "Which of the following images appears in a certain image of the above document?\nA. \nB. \nC. \nD. \nAnswer with the option's letter from the given choices directly.", "answer": 3, "response": "The", "context_length": 527794, "placed_depth": [0.49], "correct": false} diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/10_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/10_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/11_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/11_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/12_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/12_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/13_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/13_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/14_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/14_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff 
--git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/15_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/15_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/1_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/1_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/2_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/2_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/3_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/3_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/4_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/4_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/5_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/5_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/6_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/6_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/7_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl 
b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/7_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/8_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/8_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/9_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k/temp_InternVL2-2B_retrieval-image-test-long-800k/9_16_InternVL2-2B_retrieval-image-test-long-800k_ring_attn.jsonl deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k_stride_256.log b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k_stride_256.log deleted file mode 100644 index 641278875327fb190c4e96028c1bb5cfbec29e53..0000000000000000000000000000000000000000
--- a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-800k_stride_256.log
+++ /dev/null
@@ -1,196 +0,0 @@
-Traceback (most recent call last):
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 11, in <module>
-    from internvl.model.internvl_chat import InternVLChatModel
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/model/internvl_chat/__init__.py", line 10, in <module>
-    from .modeling_internvl_chat import InternVLChatModel
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/model/internvl_chat/modeling_internvl_chat.py", line 13, in <module>
-    from internvl.model.internlm2.modeling_internlm2 import InternLM2ForCausalLM
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/model/internlm2/modeling_internlm2.py", line 39, in <module>
-    from internvl.train.compress_seq_trainer import chunk_with_boundaries
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/internvl/train/compress_seq_trainer.py", line 4, in <module>
-    from transformers.integrations.tpu import tpu_spmd_dataloader
-ModuleNotFoundError: No module named 'transformers.integrations.tpu'
[the identical traceback was raised by each of the 16 srun tasks, interleaved in the original log]
-srun: error: HOST-10-140-66-136: tasks 0,2-5: Exited with exit code 1
-srun: error: HOST-10-140-66-137: tasks 12,14: Exited with exit code 1
-srun: error: HOST-10-140-66-136: tasks 1,6-7: Exited with exit code 1
-srun: error: HOST-10-140-66-137: tasks 8-11,13,15: Exited with exit code 1
diff --git a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-subset_stride_256.log b/eval_mm_niah_ring_attn_256/retrieval-image-test-long-subset_stride_256.log deleted file mode 100644 index 82367c6e3c15b46aac711664903c146fedcd5bc9..0000000000000000000000000000000000000000
--- a/eval_mm_niah_ring_attn_256/retrieval-image-test-long-subset_stride_256.log
+++ /dev/null
@@ -1,33 +0,0 @@
-  File "/mnt/hwfile/wangweiyun/workspace_gjq/VLM-Dev/VLM-Dev2/VLM-Dev/eval/mm_niah/eval_mm_niah_long.py", line 184
-    if args.interp=='linear'
-                            ^
-SyntaxError: expected ':'
[the same SyntaxError was printed by each of the 8 tasks]
-srun: error: HOST-10-140-60-9: tasks 0-7: Exited with exit code 1
diff --git a/eval_mm_niah_ring_attn_256/test_long_niah.sh b/eval_mm_niah_ring_attn_256/test_long_niah.sh deleted file mode 100644 index d5bc76d70af340d8d18711e1645dbf1aa56076e6..0000000000000000000000000000000000000000
--- a/eval_mm_niah_ring_attn_256/test_long_niah.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-PARTITION=${PARTITION:-"Intern5"}
-GPUS=${GPUS:-16} -GPUS_PER_NODE=${GPUS_PER_NODE:-8} -GPUS_PER_TASK=${GPUS_PER_TASK:-1} -QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} - - -set -x - -CHECKPOINT=${1} - -JOB_FOLDER=$(dirname "$CHECKPOINT") -files=( - "$JOB_FOLDER/configuration_intern_vit.py" - "$JOB_FOLDER/configuration_internlm2.py" - "$JOB_FOLDER/configuration_internvl_chat.py" - "$JOB_FOLDER/conversation.py" - "$JOB_FOLDER/modeling_intern_vit.py" - "$JOB_FOLDER/modeling_internlm2.py" - "$JOB_FOLDER/modeling_internvl_chat.py" - "$JOB_FOLDER/tokenization_internlm2_fast.py" - "$JOB_FOLDER/tokenization_internlm2.py" - "test_long_niah.sh" -) -for file in "${files[@]}"; do - dest_file="$CHECKPOINT/$(basename "$file")" - if [ ! -f "$dest_file" ]; then - cp "$file" "$CHECKPOINT" - fi -done -ARGS=("$@") - -declare -a tasks=( \ -'retrieval-image-test-long-subset' \ -'retrieval-image-test-long-128k' \ -) - -declare -a tasks=(\ - 'retrieval-image-test-long-1M'\ -) -declare -a tasks=(\ - 'retrieval-image-test-long-800k' \ - 'retrieval-image-test-long-512k' \ -) - -model_name="internvl" -for STRIDE in 256; do - LOG_DIR=$CHECKPOINT/eval_mm_niah_ring_attn_$STRIDE - mkdir -p $LOG_DIR - - for ((j=0; j<${#tasks[@]}; j++)); do - task=${tasks[j]} - - echo "$(date) ${model_name}_${task}_stride_${STRIDE}" - - srun -p ${PARTITION} \ - --gres=gpu:${GPUS_PER_NODE} \ - --ntasks=$((GPUS / GPUS_PER_TASK)) \ - --ntasks-per-node=$((GPUS_PER_NODE / GPUS_PER_TASK)) \ - --quotatype=${QUOTA_TYPE} \ - --job-name="${STRIDE}${task}" \ - -o "${LOG_DIR}/${task}_stride_${STRIDE}.log" \ - -e "${LOG_DIR}/${task}_stride_${STRIDE}.log" \ - --async \ - python -u eval/mm_niah/eval_mm_niah_long.py \ - --checkpoint $CHECKPOINT \ - --outputs-dir $LOG_DIR \ - --task $task \ - --num-gpus-per-rank ${GPUS_PER_TASK} "${ARGS[@]:1}" \ - --rope_pos_id_version 'v5' \ - --ring_attn \ - --rope_pos_id_stride $STRIDE \ - - - sleep 0.2 - done -done \ No newline at end of file diff --git a/eval_mm_niah_ring_attn_256/tokenization_internlm2.py b/eval_mm_niah_ring_attn_256/tokenization_internlm2.py deleted file mode 100644 index 1be581da37ef678de65f2737493fc0ed7160446e..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/tokenization_internlm2.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tokenization classes for InternLM.""" -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm -from transformers.tokenization_utils import PreTrainedTokenizer -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'} - -PRETRAINED_VOCAB_FILES_MAP = {} - - -# Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer -class InternLM2Tokenizer(PreTrainedTokenizer): - """ - Construct a InternLM2 tokenizer. 
Based on byte-level Byte-Pair-Encoding.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    model_input_names = ['input_ids', 'attention_mask']
-    _auto_class = 'AutoTokenizer'
-
-    def __init__(
-        self,
-        vocab_file,
-        unk_token='<unk>',
-        bos_token='<s>',
-        eos_token='</s>',
-        pad_token='</s>',
-        sp_model_kwargs: Optional[Dict[str, Any]] = None,
-        add_bos_token=True,
-        add_eos_token=False,
-        decode_with_prefix_space=False,
-        clean_up_tokenization_spaces=False,
-        **kwargs,
-    ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.decode_with_prefix_space = decode_with_prefix_space
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-        self._no_prefix_space_tokens = None
-        super().__init__(
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    @property
-    def no_prefix_space_tokens(self):
-        if self._no_prefix_space_tokens is None:
-            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
-            self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')}
-        return self._no_prefix_space_tokens
-
-    @property
-    def vocab_size(self):
-        """Returns vocab size"""
-        return self.sp_model.get_piece_size()
-
-    @property
-    def bos_token_id(self) -> Optional[int]:
-        return self.sp_model.bos_id()
-
-    @property
-    def eos_token_id(self) -> Optional[int]:
-        return self.sp_model.eos_id()
-
-    def get_vocab(self):
-        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text):
-        """Returns a tokenized string."""
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    def _maybe_add_prefix_space(self, tokens, decoded):
-        if tokens and tokens[0] not in self.no_prefix_space_tokens:
-            return ' ' + decoded
-        else:
-            return decoded
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ''
-        prev_is_special = False
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special:
-                    out_string += ' '
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        out_string = self.clean_up_tokenization(out_string)
-        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
-        return out_string[1:]
-
-    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
-
-        Args:
-            save_directory (`str`):
-                The directory in which to save the vocabulary.
- - Returns: - `Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error(f'Vocabulary path ({save_directory}) should be a directory') - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file'] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, 'wb') as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - if self.add_bos_token: - bos_token_ids = [self.bos_token_id] - else: - bos_token_ids = [] - - output = bos_token_ids + token_ids_0 - - if token_ids_1 is not None: - output = output + token_ids_1 - - if self.add_eos_token: - output = output + [self.eos_token_id] - - return output - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make - use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of zeros. - """ - eos = [self.eos_token_id] - - if token_ids_1 is None: - return len(token_ids_0 + eos) * [0] - return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/eval_mm_niah_ring_attn_256/tokenization_internlm2_fast.py b/eval_mm_niah_ring_attn_256/tokenization_internlm2_fast.py deleted file mode 100644 index aa0fccbd0f1d029d79e19821f2edcb01b594537c..0000000000000000000000000000000000000000 --- a/eval_mm_niah_ring_attn_256/tokenization_internlm2_fast.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. -# -# This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Tokenization Fast class for InternLM."""
-import os
-from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple
-
-from tokenizers import Tokenizer, decoders, normalizers, processors
-from tokenizers.models import BPE
-from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
-                                                 SentencePieceExtractor,
-                                                 SpmConverter)
-from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
-from transformers.utils import logging
-
-from .tokenization_internlm2 import InternLM2Tokenizer
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
-
-
-# Modified from transformers.convert_slow_tokenizer.LlamaConverter
-class InternLM2Converter(SpmConverter):
-    handle_byte_fallback = True
-
-    def vocab(self, proto):
-        vocab = [
-            ('<unk>', 0.0),
-            ('<s>', 0.0),
-            ('</s>', 0.0),
-        ]
-        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
-        return vocab
-
-    def unk_id(self, proto):
-        unk_id = 0
-        return unk_id
-
-    def decoder(self, replacement, add_prefix_space):
-        return decoders.Sequence(
-            [
-                decoders.Replace('▁', ' '),
-                decoders.ByteFallback(),
-                decoders.Fuse(),
-                decoders.Strip(content=' ', left=1),
-            ]
-        )
-
-    def tokenizer(self, proto):
-        model_type = proto.trainer_spec.model_type
-        vocab_scores = self.vocab(proto)
-        # special tokens
-        added_tokens = self.original_tokenizer.added_tokens_decoder
-        for i in range(len(vocab_scores)):
-            piece, score = vocab_scores[i]
-            if i in added_tokens:
-                vocab_scores[i] = (added_tokens[i].content, score)
-        if model_type == 1:
-            raise RuntimeError('InternLM2 is supposed to be a BPE model!')
-
-        elif model_type == 2:
-            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
-            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
-            tokenizer = Tokenizer(
-                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
-            )
-            tokenizer.add_special_tokens(
-                [ added_token for index, added_token in added_tokens.items()]
-            )
-        else:
-            raise Exception(
-                "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
-            )
-
-        return tokenizer
-
-    def normalizer(self, proto):
-        normalizers_list = []
-        if proto.normalizer_spec.add_dummy_prefix:
-            normalizers_list.append(normalizers.Prepend(prepend='▁'))
-        normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
-        return normalizers.Sequence(normalizers_list)
-
-    def pre_tokenizer(self, replacement, add_prefix_space):
-        return None
-
-
-SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
-
-
-# Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
-class InternLM2TokenizerFast(PreTrainedTokenizerFast):
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = InternLM2Tokenizer
-    padding_side = 'left'
-    model_input_names = ['input_ids', 'attention_mask']
-    _auto_class = 'AutoTokenizer'
-
-    def __init__(
-        self,
-        vocab_file,
-        unk_token='<unk>',
-        bos_token='<s>',
-        eos_token='</s>',
-        pad_token='</s>',
-        sp_model_kwargs: 
Optional[Dict[str, Any]] = None, - add_bos_token=True, - add_eos_token=False, - decode_with_prefix_space=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - super().__init__( - vocab_file=vocab_file, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - sp_model_kwargs=sp_model_kwargs, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - decode_with_prefix_space=decode_with_prefix_space, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - self._add_bos_token = add_bos_token - self._add_eos_token = add_eos_token - self.update_post_processor() - self.vocab_file = vocab_file - - @property - def can_save_slow_tokenizer(self) -> bool: - return os.path.isfile(self.vocab_file) if self.vocab_file else False - - def update_post_processor(self): - """ - Updates the underlying post processor with the current `bos_token` and `eos_token`. - """ - bos = self.bos_token - bos_token_id = self.bos_token_id - if bos is None and self.add_bos_token: - raise ValueError('add_bos_token = True but bos_token = None') - - eos = self.eos_token - eos_token_id = self.eos_token_id - if eos is None and self.add_eos_token: - raise ValueError('add_eos_token = True but eos_token = None') - - single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}" - pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}" - - special_tokens = [] - if self.add_bos_token: - special_tokens.append((bos, bos_token_id)) - if self.add_eos_token: - special_tokens.append((eos, eos_token_id)) - self._tokenizer.post_processor = processors.TemplateProcessing( - single=single, pair=pair, special_tokens=special_tokens - ) - - @property - def add_eos_token(self): - return self._add_eos_token - - @property - def add_bos_token(self): - return self._add_bos_token - - @add_eos_token.setter - def add_eos_token(self, value): - self._add_eos_token = value - self.update_post_processor() - - @add_bos_token.setter - def add_bos_token(self, value): - self._add_bos_token = value - self.update_post_processor() - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: - if not self.can_save_slow_tokenizer: - raise ValueError( - 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow ' - 'tokenizer.' - ) - - if not os.path.isdir(save_directory): - logger.error(f'Vocabulary path ({save_directory}) should be a directory') - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file'] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,)
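
Note (not part of the diff): the two tokenization files deleted above are ordinary Hugging Face remote-code tokenizers, so a minimal usage sketch looks like the following; the checkpoint path is a placeholder, and the assumption is that tokenization_internlm2*.py plus tokenizer.model sit next to the checkpoint weights.

from transformers import AutoTokenizer

# Load the InternLM2 tokenizer implemented by the deleted files above.
tokenizer = AutoTokenizer.from_pretrained(
    'path/to/InternVL2-2B',   # placeholder checkpoint directory
    trust_remote_code=True,   # required so the custom InternLM2Tokenizer(Fast) classes are used
    use_fast=True,            # prefer InternLM2TokenizerFast when tokenizers is available
)

# Round-trip a string through the SentencePiece/BPE vocabulary.
ids = tokenizer('needle in a multimodal haystack', return_tensors='pt').input_ids
print(tokenizer.decode(ids[0], skip_special_tokens=True))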
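
For reference, the deleted logs above record two separate failures: the 800k/1M runs died in every srun task while importing transformers.integrations.tpu (the installed transformers build does not provide that module), and the subset run died on a missing colon at eval/mm_niah/eval_mm_niah_long.py line 184, where `if args.interp=='linear'` needs to end with `:`. A hedged sketch of a compatibility guard for the import is shown below; the no-op fallback is an assumption for illustration, not the project's actual fix.

# internvl/train/compress_seq_trainer.py: tolerate transformers builds
# that do not ship the transformers.integrations.tpu helpers.
try:
    from transformers.integrations.tpu import tpu_spmd_dataloader
except ImportError:
    def tpu_spmd_dataloader(dataloader):
        # Hypothetical fallback: no TPU/SPMD support, return the dataloader unchanged.
        return dataloader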