Kaguya-19 committed
Commit 225e1b1 · verified · 1 parent: 11cfdf9

Update modeling_minicpm.py

Files changed (1): modeling_minicpm.py (+49, -14)
modeling_minicpm.py CHANGED
@@ -27,7 +27,7 @@ import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
+from transformers import LlamaTokenizer
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import (
@@ -35,6 +35,7 @@ from transformers.modeling_attn_mask_utils import (
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
     _prepare_4d_causal_attention_mask_for_sdpa,
+    _prepare_4d_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
@@ -320,9 +321,6 @@ class MiniCPMAttention(nn.Module):
         self.rope_theta = config.rope_theta
 
         self.is_causal = config.is_causal
-
-        logger.info(f"self.is_causal = {self.is_causal}")
-
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
             raise ValueError(
@@ -1049,17 +1047,29 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
         elif self._use_sdpa and not output_attentions:
             # output_attentions=True can not be supported when using SDPA, and we fall back on
             # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
+            if self.is_causal:
+                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    (batch_size, seq_length),
+                    inputs_embeds,
+                    past_key_values_length,
+                )
+            else:
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    attention_mask,
+                    inputs_embeds.dtype,
+                )
         else:
             # 4d mask is passed through the layers
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-            )
+            if self.is_causal:
+                attention_mask = _prepare_4d_causal_attention_mask(
+                    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+                )
+            else:
+                attention_mask = _prepare_4d_attention_mask(
+                    attention_mask,
+                    inputs_embeds.dtype,
+                )
 
         # embed positions
         hidden_states = inputs_embeds
@@ -1119,7 +1129,6 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
             attentions=all_self_attns,
         )
 
-
 class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
@@ -1335,6 +1344,32 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
         return response, history
 
 
+
+
+class MiniCPMRerankerLLamaTokenizer(LlamaTokenizer):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0, token_ids_1=None
+    ):
+        """
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return super().build_inputs_with_special_tokens(token_ids_0)
+        bos = [self.bos_token_id]
+        sep = [self.eos_token_id]
+        return bos + token_ids_0 + sep + token_ids_1
+
 @add_start_docstrings(
     """
     The MiniCPM Model transformer with a sequence classification head on top (linear layer).
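
Note: the mask-preparation change above branches on config.is_causal. When it is false, the SDPA and eager paths now build a plain bidirectional padding mask instead of a causal one, which is what an encoder-style reranking or embedding forward pass needs. A rough sketch of the difference between the two SDPA helpers, assuming a transformers version that exports both (the toy tensors below are illustrative only and not part of the commit):

import torch
from transformers.modeling_attn_mask_utils import (
    _prepare_4d_attention_mask_for_sdpa,
    _prepare_4d_causal_attention_mask_for_sdpa,
)

# Two sequences of length 3; the second one has its last position padded out.
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])
inputs_embeds = torch.zeros(2, 3, 8)

# Bidirectional variant (is_causal == False): only padded positions are masked.
# The helper may return None when no position needs masking at all.
bidirectional_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)

# Causal variant (is_causal == True): padded positions plus all future positions are masked.
causal_mask = _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask, (2, 3), inputs_embeds, past_key_values_length=0
)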
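
Note: the new MiniCPMRerankerLLamaTokenizer only changes how a sequence pair is joined, producing `<s> A </s> B` so that a query and a passage can be scored in a single forward pass. A minimal usage sketch, assuming the repository's tokenizer_config.json points at this class and the tokenizer is loaded with trust_remote_code; the checkpoint path and example strings are placeholders, not part of the commit:

from transformers import AutoTokenizer

# Placeholder repo id; substitute the actual MiniCPM reranker checkpoint.
tokenizer = AutoTokenizer.from_pretrained("path/to/MiniCPM-Reranker", trust_remote_code=True)

query = tokenizer.encode("what is the capital of France?", add_special_tokens=False)
passage = tokenizer.encode("Paris is the capital of France.", add_special_tokens=False)

# With a pair, build_inputs_with_special_tokens returns bos + query + eos + passage.
pair_ids = tokenizer.build_inputs_with_special_tokens(query, passage)

# With a single sequence, it falls back to the parent LlamaTokenizer behaviour.
single_ids = tokenizer.build_inputs_with_special_tokens(query)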