shepnerd committed
Commit 1f40ad7 · verified · Parent: 7fc435d

Update modeling_internvideo2.py

Files changed (1)
  1. modeling_internvideo2.py  (+4, -17)
modeling_internvideo2.py CHANGED
@@ -1056,11 +1056,6 @@ def pretrain_internvideo2_1b_patch14_224(config):
         clip_return_layer=config.vision_encoder.clip_return_layer,
         clip_student_return_interval=config.vision_encoder.clip_student_return_interval,
     )
-
-    # if config.vision_encoder.pretrained is not None:
-    #     state_dict = torch.load(config.vision_encoder.pretrained, map_location='cpu')
-    #     interpolate_pos_embed_internvideo2(state_dict, model, orig_t_size=8)
-    #     message = model.load_state_dict(state_dict, strict=False)
 
     return model
 
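With the commented-out loader removed from the 1B builder, a caller who still wants that path can run it by hand after pretrain_internvideo2_1b_patch14_224(config) returns. A minimal sketch, reusing the names from the removed comments and assuming interpolate_pos_embed_internvideo2 is available from this module:

import torch

def load_pretrained_vision_weights(model, config):
    # Hypothetical helper mirroring the commented-out block removed above.
    if config.vision_encoder.pretrained is not None:
        state_dict = torch.load(config.vision_encoder.pretrained, map_location='cpu')
        # Resize temporal position embeddings from the original 8-frame layout.
        interpolate_pos_embed_internvideo2(state_dict, model, orig_t_size=8)
        message = model.load_state_dict(state_dict, strict=False)
        print(message)
    return model

Calling load_pretrained_vision_weights(pretrain_internvideo2_1b_patch14_224(config), config) would then reproduce the old in-builder behaviour.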
@@ -1071,8 +1066,10 @@ def pretrain_internvideo2_6b_patch14_224(config):
         embed_dim=3200, depth=48, num_heads=25, mlp_ratio=4,
         clip_embed_dim=config.vision_encoder.clip_embed_dim,
         attn_pool_num_heads=16, qkv_bias=False,
-        drop_path_rate=0.3,
-        init_values=0.00001,
+        # drop_path_rate=0.3,
+        # init_values=0.00001,
+        drop_path_rate=0,
+        init_values=None,
         qk_normalization=True,
         use_flash_attn=config.vision_encoder.use_flash_attn,
         use_fused_rmsnorm=config.vision_encoder.use_fused_rmsnorm,
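The 6B builder now passes drop_path_rate=0 and init_values=None instead of the hard-coded 0.3 and 0.00001, i.e. stochastic depth off and no LayerScale at construction. As a rough, hypothetical sketch of what these two arguments conventionally control in a ViT-style residual branch (the real block implementation in this file may differ):

import torch
import torch.nn as nn

class ResidualBranch(nn.Module):
    # Hypothetical illustration only, not the InternVideo2 block itself.
    def __init__(self, dim, drop_path_rate=0.0, init_values=None):
        super().__init__()
        self.drop_path_rate = drop_path_rate
        # init_values=None -> no LayerScale; a float -> learnable per-channel scale.
        self.gamma = nn.Parameter(init_values * torch.ones(dim)) if init_values else None

    def forward(self, x, branch_out):
        if self.gamma is not None:
            branch_out = self.gamma * branch_out
        if self.training and self.drop_path_rate > 0:
            # Stochastic depth: randomly drop the whole branch per sample.
            keep = 1.0 - self.drop_path_rate
            mask = branch_out.new_empty(branch_out.shape[0], 1, 1).bernoulli_(keep) / keep
            branch_out = branch_out * mask
        return x + branch_out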
@@ -1091,12 +1088,6 @@ def pretrain_internvideo2_6b_patch14_224(config):
         clip_return_layer=config.vision_encoder.clip_return_layer,
         clip_student_return_interval=config.vision_encoder.clip_student_return_interval,
     )
-
-    # if config.vision_encoder.pretrained is not None:
-
-    #     state_dict = torch.load(config.vision_encoder.pretrained, map_location='cpu')
-    #     interpolate_pos_embed_internvideo2(state_dict, model, orig_t_size=8)
-    #     msg = model.load_state_dict(state_dict, strict=False)
 
     return model
 
@@ -3155,7 +3146,6 @@ class InternVideo2_Stage2(
 
     def __init__(self,
                  config: InternVideo2_Stage2_Config,
-                 # tokenizer,
                  is_pretrain: bool=True):
 
         super(InternVideo2_Stage2, self).__init__(config)
@@ -3172,10 +3162,7 @@ class InternVideo2_Stage2(
 
         # create modules.
         self.vision_encoder = self.build_vision_encoder()
-        self.freeze_vision()
-
         self.text_encoder = self.build_text_encoder()
-        self.freeze_text()
 
         self.vision_proj = nn.Linear(self.vision_width, self.embed_dim)
         self.text_proj = nn.Linear(self.text_width, self.embed_dim)
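In InternVideo2_Stage2.__init__ the freeze_vision() and freeze_text() calls are dropped, so both encoders now come back trainable after construction. A minimal sketch of how a caller could freeze them again by hand; the attribute names come from the diff, while the loop itself is an assumption about what the removed methods did:

def freeze_encoders(model):
    # Hypothetical helper: disable gradients for both encoders.
    for p in model.vision_encoder.parameters():
        p.requires_grad = False
    for p in model.text_encoder.parameters():
        p.requires_grad = False
    return model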
 