Upload tokenizer

Files changed:
- tokenizer.py (+1 -23)
- tokenizer_config.json (+0 -9)
tokenizer.py
CHANGED
@@ -24,7 +24,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         unk_token_id: int,
         model_max_length: int,
         other_special_token_ids: Optional[List[int]] = None,
-        ambiguous_token_ids: Optional[List[int]] = None,  # str = "XBOUZJ"
         **kwargs,
     ):
         """Vocabulary comprising the amino acids, and the special tokens <unk>, <bos>, <eos>, <pad> and <mask>.
@@ -69,7 +68,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
            unk_token_id=unk_token_id,
            unk_token=id_to_token.get(unk_token_id),
            other_special_token_ids=other_special_token_ids,
-           ambiguous_token_ids=ambiguous_token_ids,
            model_input_names=["input_ids", "attention_mask", "special_tokens_mask"],
            tokenizer_object=tokenizer_object,
        )
@@ -77,8 +75,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
        if other_special_token_ids is not None:
            self.add_special_tokens({"additional_special_tokens": list(id_to_token.get(i) for i in other_special_token_ids)})

-       self.ambiguous_token_ids = ambiguous_token_ids
-
        self.key_to_padding = {"input_ids": self.pad_token_id, "attention_mask": 0, "special_tokens_mask": 1, "position_ids": 0}
        self.key_to_dtype = {
            "input_ids": torch.long,
@@ -130,7 +126,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
        """

        for i, sequence in enumerate(encoded_inputs["input_ids"]):
-           mask = [
+           mask = [token_id != self.unk_token_id for token_id in sequence]
            for key in encoded_inputs:
                encoded_inputs[key][i] = list(compress(encoded_inputs[key][i], mask))
        return encoded_inputs
@@ -143,15 +139,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
        pad_to_multiple_of: int = 8,
        **kwargs,
    ) -> Dict[str, List[List[int]]]:
-       """
-       Remove ambiguous amino acids from the input sequences.
-
-       Args:
-           encoded_inputs (Dict[str, List[List[int]]): Tokenized inputs with keys like 'input_ids' as tensors.
-
-       Returns:
-           Dict[str, List[List[int]]]: Tokenized inputs without ambiguous amino acids.
-       """

        if isinstance(encoded_inputs, list):
            tmp = dict()
@@ -185,15 +172,6 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
        return_tensors: str = "pt",
        **kwargs,
    ) -> Dict[str, List[List[int]]]:
-       """
-       Remove ambiguous amino acids from the input sequences.
-
-       Args:
-           encoded_inputs (Dict[str, List[List[int]]): Tokenized inputs with keys like 'input_ids' as tensors.
-
-       Returns:
-           Dict[str, List[List[int]]]: Tokenized inputs without ambiguous amino acids.
-       """

        encoded_inputs = self._pad(
            encoded_inputs,
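Net effect on tokenizer.py: the ambiguous_token_ids argument (covering the ambiguous amino acids "XBOUZJ"), the stored self.ambiguous_token_ids attribute, and the two docstrings describing the removal of ambiguous amino acids are deleted; the filtering step now masks on unk_token_id instead. The following is a minimal, self-contained sketch of that masking idiom (one boolean mask shared across every key of the batch, applied with itertools.compress); the token ids and the toy batch are invented for illustration and are not the repository's vocabulary.

# Sketch of the filtering pattern used in the added line above (illustrative values only).
from itertools import compress

unk_token_id = 3  # hypothetical id; the real value comes from the tokenizer's vocabulary

encoded_inputs = {
    "input_ids": [[5, 3, 12, 3, 7]],
    "attention_mask": [[1, 1, 1, 1, 1]],
    "special_tokens_mask": [[0, 0, 0, 0, 0]],
}

for i, sequence in enumerate(encoded_inputs["input_ids"]):
    # Keep only positions whose id is not the unknown-token id...
    mask = [token_id != unk_token_id for token_id in sequence]
    # ...and apply the same mask to every parallel key so the lists stay aligned.
    for key in encoded_inputs:
        encoded_inputs[key][i] = list(compress(encoded_inputs[key][i], mask))

print(encoded_inputs["input_ids"])  # [[5, 12, 7]]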
tokenizer_config.json
CHANGED
@@ -41,15 +41,6 @@
      "special": true
    }
  },
- "ambiguous_token_ids": [
-   1,
-   6,
-   7,
-   8,
-   9,
-   10,
-   11
- ],
  "auto_map": {
    "AutoTokenizer": [
      "tokenizer.ProteinTokenizer",
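With ambiguous_token_ids dropped from tokenizer_config.json, the config's auto_map entry still points AutoTokenizer at the custom tokenizer.ProteinTokenizer class, so it loads through the usual remote-code path. A usage sketch, assuming a placeholder repository id and a plain protein sequence as input:

# Usage sketch: load the custom tokenizer via the auto_map entry shown above.
# "user/protein-model" is a placeholder repository id, not the actual repo.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "user/protein-model",
    trust_remote_code=True,  # needed so tokenizer.ProteinTokenizer is imported from the repo
)

batch = tokenizer(["MKTAYIAKQR"], return_tensors="pt")
print(batch["input_ids"].shape)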