czczup committed · verified
Commit fd7963e · 1 Parent(s): 32948f9

Update model code

InternVL2-8B-Pretrain/configuration_internvl_chat.py CHANGED
@@ -64,6 +64,8 @@ class InternVLChatConfig(PretrainedConfig):
         self.ps_version = ps_version  # pixel shuffle version
         self.min_dynamic_patch = min_dynamic_patch
         self.max_dynamic_patch = max_dynamic_patch
+        # By default, we use tie_word_embeddings=False for models of all sizes.
+        self.tie_word_embeddings = self.llm_config.tie_word_embeddings

         logger.info(f'vision_select_layer: {self.select_layer}')
         logger.info(f'ps_version: {self.ps_version}')
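Note on this change: the wrapper config now mirrors the LLM sub-config's tie_word_embeddings instead of inheriting PretrainedConfig's default of True, so generic Transformers logic (e.g. the weight-tying check during save/load) sees the value the language model actually uses. A minimal sketch of the pattern, assuming standard PretrainedConfig attributes (the WrapperConfig name is hypothetical, not the real class):

# Hypothetical sketch of the propagation pattern used in the diff above.
from transformers import PretrainedConfig

class WrapperConfig(PretrainedConfig):  # illustrative name only
    def __init__(self, llm_config=None, **kwargs):
        super().__init__(**kwargs)  # defaults tie_word_embeddings to True
        self.llm_config = llm_config
        if llm_config is not None:
            # Mirror the LLM's setting so tying logic sees the right value.
            self.tie_word_embeddings = llm_config.tie_word_embeddings

llm_cfg = PretrainedConfig(tie_word_embeddings=False)
assert WrapperConfig(llm_config=llm_cfg).tie_word_embeddings is False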
InternVL2-8B-Pretrain/modeling_intern_vit.py CHANGED
@@ -364,6 +364,7 @@ class InternVisionEncoder(nn.Module):
 class InternVisionModel(PreTrainedModel):
     main_input_name = 'pixel_values'
     _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
     config_class = InternVisionConfig
     _no_split_modules = ['InternVisionEncoderLayer']

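Note: supports_gradient_checkpointing = True opts the vision tower into Transformers' generic gradient-checkpointing machinery, so enabling checkpointing recomputes ViT activations during the backward pass instead of storing them. Illustrative usage (the model id below is an example, not prescriptive):

# Illustrative usage of the standard Transformers toggle.
from transformers import AutoModel

model = AutoModel.from_pretrained('OpenGVLab/InternVL2-8B', trust_remote_code=True)
# Trades extra forward compute for lower activation memory during training.
model.gradient_checkpointing_enable()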
 
InternVL2-8B-Pretrain/modeling_internvl_chat.py CHANGED
@@ -38,12 +38,13 @@ class InternVLChatModel(PreTrainedModel):
     main_input_name = 'pixel_values'
     base_model_prefix = 'language_model'
     _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
     _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer']

     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
         super().__init__(config)

-        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
+        assert version_cmp(transformers.__version__, '4.37.0', 'ge')
         image_size = config.force_image_size or config.vision_config.image_size
         patch_size = config.vision_config.patch_size
         self.patch_size = patch_size
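Note on the version bump: the assert now requires transformers >= 4.37.0 (up from 4.36.2). version_cmp is the helper shipped with this repo; a rough equivalent of the same guard using the packaging library:

# Rough equivalent of version_cmp(transformers.__version__, '4.37.0', 'ge').
from packaging import version
import transformers

assert version.parse(transformers.__version__) >= version.parse('4.37.0'), \
    'this remote code expects transformers >= 4.37.0'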
 
113
  B, N, C = input_embeds.shape
114
  input_embeds = input_embeds.reshape(B * N, C)
115
 
116
+ if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
117
  print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
118
 
119
  input_ids = input_ids.reshape(B * N)
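Note: torch.distributed.get_rank() raises a RuntimeError when the default process group has never been initialized, so the old debug print crashed plain single-process inference. The added is_initialized() check short-circuits first. The same pattern as a reusable helper (rank_zero_print is a hypothetical name, not part of the repo):

# Hypothetical helper showing the guarded-print pattern from the diff.
import torch.distributed as dist

def rank_zero_print(*args, **kwargs):
    # Matches the diff's behavior: print from rank 0 in distributed runs,
    # stay silent (instead of crashing) when no process group exists.
    if dist.is_initialized() and dist.get_rank() == 0:
        print(*args, **kwargs)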
 
@@ -347,3 +348,13 @@ class InternVLChatModel(PreTrainedModel):
         )

         return outputs
+
+    @property
+    def lm_head(self):
+        return self.language_model.get_output_embeddings()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
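Note: the new accessors delegate to the wrapped language model, which lets generic Transformers utilities (weight tying, resize_token_embeddings, generation) locate the embeddings and lm_head on the composite model. An illustrative check (model id is an example, as before):

# Illustrative: with the delegating accessors, top-level lookups resolve
# into the wrapped language model.
from transformers import AutoModel

model = AutoModel.from_pretrained('OpenGVLab/InternVL2-8B', trust_remote_code=True)
# lm_head and get_output_embeddings() return the same underlying module.
assert model.get_output_embeddings() is model.lm_head
print(model.get_input_embeddings().weight.shape)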