Add supports_gradient_checkpointing
- configuration_internvl_chat.py +2 -0
- modeling_intern_vit.py +1 -0
- modeling_internvl_chat.py +11 -0
configuration_internvl_chat.py
CHANGED
@@ -61,6 +61,8 @@ class InternVLChatConfig(PretrainedConfig):
         self.ps_version = ps_version # pixel shuffle version
         self.min_dynamic_patch = min_dynamic_patch
         self.max_dynamic_patch = max_dynamic_patch
+        # By default, we use tie_word_embeddings=False for models of all sizes.
+        self.tie_word_embeddings = self.llm_config.tie_word_embeddings

         logger.info(f'vision_select_layer: {self.select_layer}')
         logger.info(f'ps_version: {self.ps_version}')
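In practice this means the top-level config now mirrors the weight-tying flag of its nested LLM config, so callers no longer have to reach into config.llm_config themselves. A minimal sketch of the intended effect, assuming a checkpoint that ships this remote code (the checkpoint name below is illustrative only, not named by this commit):

from transformers import AutoConfig

# Hypothetical checkpoint name, for illustration only.
config = AutoConfig.from_pretrained('OpenGVLab/InternVL-Chat-V1-5', trust_remote_code=True)

# After this change, the top-level flag is populated from the nested LLM config.
assert config.tie_word_embeddings == config.llm_config.tie_word_embeddings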
modeling_intern_vit.py
CHANGED
@@ -364,6 +364,7 @@ class InternVisionEncoder(nn.Module):
 class InternVisionModel(PreTrainedModel):
     main_input_name = 'pixel_values'
     _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
     config_class = InternVisionConfig
     _no_split_modules = ['InternVisionEncoderLayer']

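Declaring supports_gradient_checkpointing = True lets the standard PreTrainedModel machinery toggle activation checkpointing on the vision tower. A minimal usage sketch, assuming a checkpoint that bundles this remote code (the model name is an illustrative assumption, not prescribed by this commit):

import torch
from transformers import AutoModel

# Illustrative checkpoint name only.
model = AutoModel.from_pretrained(
    'OpenGVLab/InternViT-6B-448px-V1-5',
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
# Re-computes activations during the backward pass to trade compute for memory.
model.gradient_checkpointing_enable()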
modeling_internvl_chat.py
CHANGED
@@ -35,6 +35,7 @@ class InternVLChatModel(PreTrainedModel):
     main_input_name = 'pixel_values'
     base_model_prefix = 'language_model'
     _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
     _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer']

     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
@@ -344,3 +345,13 @@ class InternVLChatModel(PreTrainedModel):
         )

         return outputs
+
+    @property
+    def lm_head(self):
+        return self.language_model.get_output_embeddings()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()