Upload tokenizer
- tokenizer.py +6 -7
- tokenizer_config.json +29 -1
tokenizer.py
CHANGED
@@ -16,7 +16,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
 
     def __init__(
         self,
-
+        vocab: dict,
         pad_token_id: int,
         mask_token_id: int,
         bos_token_id: int,
@@ -42,11 +42,10 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         token_to_id = dict()
         id_to_token = dict()
 
-
-
-
-
-            id_to_token[i] = token
+        for token, token_id in vocab.items():
+            token = token.strip()
+            token_to_id[token] = token_id
+            id_to_token[token_id] = token
 
         # Define tokenizer and model
         tokenizer_object = Tokenizer(WordPiece(vocab=token_to_id, unk_token=id_to_token.get(unk_token_id)))
@@ -55,7 +54,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         tokenizer_object.pre_tokenizer = Split("", behavior="removed")
 
         super().__init__(
-
+            vocab=vocab,
             model_max_length=model_max_length,
             padding_side="right",
             truncation_side="right",
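The new __init__ builds both lookup tables directly from the vocab argument and hands them to a character-level WordPiece model. The same construction can be reproduced standalone with the tokenizers library; the sketch below uses only a small subset of the vocabulary added to tokenizer_config.json in this commit and hard-codes unk_token_id = 1, so it illustrates the mechanism rather than the full ProteinTokenizer.

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Split

# Illustrative subset of the vocab shipped in tokenizer_config.json below.
vocab = {"<pad>": 0, "<unk>": 1, "<mask>": 2, "<bos>": 3, "<eos>": 4, "|": 5, "L": 6, "A": 7}
unk_token_id = 1

token_to_id = dict()
id_to_token = dict()
for token, token_id in vocab.items():
    token = token.strip()  # same whitespace cleanup as the new __init__
    token_to_id[token] = token_id
    id_to_token[token_id] = token

# Empty-pattern Split breaks the sequence into single characters, which the
# WordPiece model then maps one-to-one onto vocabulary ids.
tokenizer_object = Tokenizer(WordPiece(vocab=token_to_id, unk_token=id_to_token.get(unk_token_id)))
tokenizer_object.pre_tokenizer = Split("", behavior="removed")

print(tokenizer_object.encode("ALA").ids)  # [7, 6, 7] with this subset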
tokenizer_config.json
CHANGED
@@ -77,5 +77,33 @@
   "truncation_side": "right",
   "unk_token": "<unk>",
   "unk_token_id": 1,
-  "
+  "vocab": {
+    "<bos>": 3,
+    "<eos>": 4,
+    "<mask>": 2,
+    "<pad>": 0,
+    "<unk>": 1,
+    "A": 7,
+    "B": 26,
+    "C": 25,
+    "D": 15,
+    "E": 11,
+    "F": 20,
+    "G": 8,
+    "H": 23,
+    "I": 14,
+    "K": 17,
+    "L": 6,
+    "M": 22,
+    "N": 19,
+    "P": 16,
+    "Q": 18,
+    "R": 12,
+    "S": 10,
+    "T": 13,
+    "V": 9,
+    "W": 24,
+    "Y": 21,
+    "|": 5
+  }
 }
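With the vocabulary now embedded in tokenizer_config.json, the tokenizer can be rebuilt from the repository alone. A minimal loading sketch, assuming a placeholder repo id and that the config maps ProteinTokenizer to tokenizer.py via auto_map; trust_remote_code=True is needed because the class lives in the repo rather than in transformers:

from transformers import AutoTokenizer

# "user/protein-model" is a placeholder, not the actual repository id.
tok = AutoTokenizer.from_pretrained("user/protein-model", trust_remote_code=True)

enc = tok("MKTAYIAK")
print(enc["input_ids"])
print(tok.convert_ids_to_tokens(enc["input_ids"]))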