Lolalb committed
Commit bf62f33 · verified · 1 Parent(s): aba41bc

Upload tokenizer

Files changed (2):
  1. tokenizer.py +1 -23
  2. tokenizer_config.json +0 -9
tokenizer.py CHANGED
@@ -24,7 +24,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         unk_token_id: int,
         model_max_length: int,
         other_special_token_ids: Optional[List[int]] = None,
-        ambiguous_token_ids: Optional[List[int]] = None,  # str = "XBOUZJ"
         **kwargs,
     ):
         """Vocabulary comprising the amino acids, and the special tokens <unk>, <bos>, <eos>, <pad> and <mask>.
@@ -69,7 +68,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
             unk_token_id=unk_token_id,
             unk_token=id_to_token.get(unk_token_id),
             other_special_token_ids=other_special_token_ids,
-            ambiguous_token_ids=ambiguous_token_ids,
             model_input_names=["input_ids", "attention_mask", "special_tokens_mask"],
             tokenizer_object=tokenizer_object,
         )
@@ -77,8 +75,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         if other_special_token_ids is not None:
             self.add_special_tokens({"additional_special_tokens": list(id_to_token.get(i) for i in other_special_token_ids)})
 
-        self.ambiguous_token_ids = ambiguous_token_ids
-
         self.key_to_padding = {"input_ids": self.pad_token_id, "attention_mask": 0, "special_tokens_mask": 1, "position_ids": 0}
         self.key_to_dtype = {
             "input_ids": torch.long,
@@ -130,7 +126,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         """
 
         for i, sequence in enumerate(encoded_inputs["input_ids"]):
-            mask = [token not in self.ambiguous_token_ids for token in sequence]
+            mask = [token_id != self.unk_token_id for token_id in sequence]
             for key in encoded_inputs:
                 encoded_inputs[key][i] = list(compress(encoded_inputs[key][i], mask))
         return encoded_inputs
@@ -143,15 +139,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         pad_to_multiple_of: int = 8,
         **kwargs,
     ) -> Dict[str, List[List[int]]]:
-        """
-        Remove ambiguous amino acids from the input sequences.
-
-        Args:
-            encoded_inputs (Dict[str, List[List[int]]]): Tokenized inputs with keys like 'input_ids' as tensors.
-
-        Returns:
-            Dict[str, List[List[int]]]: Tokenized inputs without ambiguous amino acids.
-        """
 
         if isinstance(encoded_inputs, list):
             tmp = dict()
@@ -185,15 +172,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         return_tensors: str = "pt",
         **kwargs,
     ) -> Dict[str, List[List[int]]]:
-        """
-        Remove ambiguous amino acids from the input sequences.
-
-        Args:
-            encoded_inputs (Dict[str, List[List[int]]]): Tokenized inputs with keys like 'input_ids' as tensors.
-
-        Returns:
-            Dict[str, List[List[int]]]: Tokenized inputs without ambiguous amino acids.
-        """
 
         encoded_inputs = self._pad(
             encoded_inputs,
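In effect, the commit drops the dedicated ambiguous_token_ids list (the ids of the ambiguous residues X, B, O, U, Z, J) and filters on the <unk> id instead: any token that maps to <unk> is removed from every parallel field of the encoding. A minimal sketch of the new filtering step, using itertools.compress as the source does; the token ids and sequences below are hypothetical, not taken from the real vocabulary:

# Sketch of the post-commit filtering rule (ids are made up for illustration).
from itertools import compress

unk_token_id = 1  # assumed id of <unk>

encoded_inputs = {
    "input_ids":      [[3, 1, 5, 1, 7]],  # two <unk> tokens to drop
    "attention_mask": [[1, 1, 1, 1, 1]],
}

for i, sequence in enumerate(encoded_inputs["input_ids"]):
    # New rule: keep every token that is not <unk>, instead of checking
    # membership in a separate ambiguous_token_ids list.
    mask = [token_id != unk_token_id for token_id in sequence]
    for key in encoded_inputs:
        encoded_inputs[key][i] = list(compress(encoded_inputs[key][i], mask))

print(encoded_inputs)
# {'input_ids': [[3, 5, 7]], 'attention_mask': [[1, 1, 1]]}

Note the implicit assumption: the simplification only behaves like the old code if the ambiguous residues X/B/O/U/Z/J actually tokenize to <unk> rather than to ids of their own.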
tokenizer_config.json CHANGED
@@ -41,15 +41,6 @@
       "special": true
     }
   },
-  "ambiguous_token_ids": [
-    1,
-    6,
-    7,
-    8,
-    9,
-    10,
-    11
-  ],
   "auto_map": {
     "AutoTokenizer": [
       "tokenizer.ProteinTokenizer",
 