Upload tokenizer
- tokenizer.py: +13 −3
- tokenizer_config.json: +10 −1
tokenizer.py

@@ -1,6 +1,7 @@
 import torch
 from typing import List, Optional, Union, Dict
 from torch import Tensor
+import copy
 
 from itertools import compress
 
@@ -21,7 +22,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         bos_token_id: int,
         eos_token_id: int,
         unk_token_id: int,
-
+        model_max_length: int,
         other_special_token_ids: Optional[List[int]] = None,
         ambiguous_token_ids: Optional[List[int]] = None,  # str = "XBOUZJ"
         **kwargs,
@@ -54,16 +55,25 @@
         tokenizer_object.pre_tokenizer = Split("", behavior="removed")
 
         super().__init__(
-
+            vocab_path=vocab_path,
+            model_max_length=model_max_length,
             padding_side="right",
             truncation_side="right",
+            pad_token_id=pad_token_id,
             pad_token=id_to_token.get(pad_token_id),
+            mask_token_id=mask_token_id,
+            mask_token=id_to_token.get(mask_token_id),
+            bos_token_id=bos_token_id,
             bos_token=id_to_token.get(bos_token_id),
+            eos_token_id=eos_token_id,
             eos_token=id_to_token.get(eos_token_id),
+            unk_token_id=unk_token_id,
             unk_token=id_to_token.get(unk_token_id),
-
+            other_special_token_ids=other_special_token_ids,
+            ambiguous_token_ids=ambiguous_token_ids,
             model_input_names=["input_ids", "attention_mask", "special_tokens_mask"],
             tokenizer_object=tokenizer_object,
+            **kwargs,
         )
 
         if other_special_token_ids is not None:
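The net effect of the tokenizer.py change is that everything the tokenizer needs to rebuild itself (vocab_path, model_max_length, the special-token ids, and any remaining **kwargs) is now forwarded to PreTrainedTokenizerFast.__init__, which keeps unrecognised keyword arguments in init_kwargs and writes them to tokenizer_config.json on save_pretrained. A minimal usage sketch under that assumption; the repo id below is a placeholder rather than something stated in this commit:

from transformers import AutoTokenizer

# trust_remote_code is required because ProteinTokenizer is resolved through the
# "auto_map" entry in tokenizer_config.json and lives in the repo's tokenizer.py.
tok = AutoTokenizer.from_pretrained("chandar-lab/AMPLIFY_350M", trust_remote_code=True)

# The ids now passed through super().__init__() are available as plain attributes.
print(tok.pad_token_id, tok.unk_token_id, tok.mask_token_id, tok.bos_token_id, tok.eos_token_id)
print(tok.model_max_length)  # 2048, read from tokenizer_config.json

# Per-residue tokenization: Split("", behavior="removed") splits the sequence
# into single characters before the vocabulary lookup.
batch = tok("MKTAYIAKQR", return_special_tokens_mask=True, return_tensors="pt")
print(batch["input_ids"].shape, batch["special_tokens_mask"])

Calling tok.save_pretrained(...) would write the same ids back into tokenizer_config.json, which is presumably how the updated config in the second file of this commit was produced.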
tokenizer_config.json

@@ -41,6 +41,7 @@
       "special": true
     }
   },
+  "ambiguous_token_ids": null,
   "auto_map": {
     "AutoTokenizer": [
       "tokenizer.ProteinTokenizer",
@@ -48,18 +49,26 @@
     ]
   },
   "bos_token": "<bos>",
+  "bos_token_id": 3,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<eos>",
+  "eos_token_id": 4,
   "mask_token": "<mask>",
+  "mask_token_id": 2,
   "model_input_names": [
     "input_ids",
     "attention_mask",
     "special_tokens_mask"
   ],
   "model_max_length": 2048,
+  "other_special_token_ids": null,
   "pad_token": "<pad>",
+  "pad_token_id": 0,
   "padding_side": "right",
   "tokenizer_class": "ProteinTokenizer",
   "truncation_side": "right",
-  "unk_token": "<unk>"
+  "unk_token": "<unk>",
+  "unk_token_id": 1,
+  "vocab_path": "/home/mila/l/lola.lebreton/AMPLIFY/conf/tokenizer/amplify_vocab.txt",
+  "vocab_size": 27
 }
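Each key added to tokenizer_config.json mirrors a parameter of the updated ProteinTokenizer.__init__: when AutoTokenizer instantiates the class via auto_map, the config entries are passed back to it as keyword arguments, so the token ids (pad 0, unk 1, mask 2, bos 3, eos 4) and model_max_length 2048 no longer have to be supplied by hand. A small sketch of that mapping, assuming a locally downloaded copy of the file:

import json

# Collect the constructor arguments that the updated config now records.
with open("tokenizer_config.json") as f:
    config = json.load(f)

init_kwargs = {
    key: config[key]
    for key in (
        "vocab_path",
        "model_max_length",
        "pad_token_id",
        "mask_token_id",
        "bos_token_id",
        "eos_token_id",
        "unk_token_id",
        "other_special_token_ids",
        "ambiguous_token_ids",
    )
}
print(init_kwargs["pad_token_id"], init_kwargs["unk_token_id"],
      init_kwargs["mask_token_id"], init_kwargs["bos_token_id"],
      init_kwargs["eos_token_id"])      # 0 1 2 3 4
print(init_kwargs["model_max_length"])  # 2048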