Lolalb committed

Commit dea2d8a · verified · 1 Parent(s): a70e7f8

Upload tokenizer

Files changed (2)
  1. tokenizer.py +6 -7
  2. tokenizer_config.json +29 -1
tokenizer.py CHANGED
@@ -16,7 +16,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
 
     def __init__(
         self,
-        vocab_path: str,
+        vocab: dict,
         pad_token_id: int,
         mask_token_id: int,
         bos_token_id: int,
@@ -42,11 +42,10 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         token_to_id = dict()
         id_to_token = dict()
 
-        with open(vocab_path, "r") as vocab_file:
-            for i, token in enumerate(vocab_file):
-                token = token.strip()
-                token_to_id[token] = i
-                id_to_token[i] = token
+        for token, token_id in vocab.items():
+            token = token.strip()
+            token_to_id[token] = token_id
+            id_to_token[token_id] = token
 
         # Define tokenizer and model
         tokenizer_object = Tokenizer(WordPiece(vocab=token_to_id, unk_token=id_to_token.get(unk_token_id)))
@@ -55,7 +54,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         tokenizer_object.pre_tokenizer = Split("", behavior="removed")
 
         super().__init__(
-            vocab_path=vocab_path,
+            vocab=vocab,
            model_max_length=model_max_length,
            padding_side="right",
            truncation_side="right",
tokenizer_config.json CHANGED
@@ -77,5 +77,33 @@
   "truncation_side": "right",
   "unk_token": "<unk>",
   "unk_token_id": 1,
-  "vocab_path": "conf/tokenizer/amplify_vocab.txt"
+  "vocab": {
+    "<bos>": 3,
+    "<eos>": 4,
+    "<mask>": 2,
+    "<pad>": 0,
+    "<unk>": 1,
+    "A": 7,
+    "B": 26,
+    "C": 25,
+    "D": 15,
+    "E": 11,
+    "F": 20,
+    "G": 8,
+    "H": 23,
+    "I": 14,
+    "K": 17,
+    "L": 6,
+    "M": 22,
+    "N": 19,
+    "P": 16,
+    "Q": 18,
+    "R": 12,
+    "S": 10,
+    "T": 13,
+    "V": 9,
+    "W": 24,
+    "Y": 21,
+    "|": 5
+  }
 }
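With the vocabulary embedded directly in tokenizer_config.json, the tokenizer no longer depends on a separate vocab file being present on disk. A hypothetical loading sketch follows; the repository id is a placeholder, and trust_remote_code is assumed to be needed because tokenizer.py ships a custom tokenizer class:

# Hypothetical usage; swap in the real repository id.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "your-org/your-protein-model",  # placeholder, not a real repo id
    trust_remote_code=True,         # tokenizer.py defines the custom ProteinTokenizer
)

# Character-level tokenization of a short protein sequence.
print(tokenizer("MALVK").input_ids)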