Upload tokenizer
- tokenizer.py +6 -7
- tokenizer_config.json +29 -1
tokenizer.py
CHANGED
@@ -16,7 +16,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
 
     def __init__(
         self,
-
+        vocab: dict,
         pad_token_id: int,
         mask_token_id: int,
         bos_token_id: int,
@@ -42,11 +42,10 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         token_to_id = dict()
         id_to_token = dict()
 
-
-
-
-
-            id_to_token[i] = token
+        for token, token_id in vocab.items():
+            token = token.strip()
+            token_to_id[token] = token_id
+            id_to_token[token_id] = token
 
         # Define tokenizer and model
         tokenizer_object = Tokenizer(WordPiece(vocab=token_to_id, unk_token=id_to_token.get(unk_token_id)))
@@ -55,7 +54,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         tokenizer_object.pre_tokenizer = Split("", behavior="removed")
 
         super().__init__(
-
+            vocab=vocab,
             model_max_length=model_max_length,
             padding_side="right",
             truncation_side="right",
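The new __init__ builds both lookup tables directly from the vocab argument and hands them to a character-level WordPiece model. The same construction can be reproduced standalone with the tokenizers library; the sketch below uses only a small subset of the vocabulary added to tokenizer_config.json in this commit and hard-codes unk_token_id = 1, so it illustrates the mechanism rather than the full ProteinTokenizer.

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Split

# Illustrative subset of the vocab shipped in tokenizer_config.json below.
vocab = {"<pad>": 0, "<unk>": 1, "<mask>": 2, "<bos>": 3, "<eos>": 4, "|": 5, "L": 6, "A": 7}
unk_token_id = 1

token_to_id = dict()
id_to_token = dict()
for token, token_id in vocab.items():
    token = token.strip()  # same whitespace cleanup as the new __init__
    token_to_id[token] = token_id
    id_to_token[token_id] = token

# Empty-pattern Split breaks the sequence into single characters, which the
# WordPiece model then maps one-to-one onto vocabulary ids.
tokenizer_object = Tokenizer(WordPiece(vocab=token_to_id, unk_token=id_to_token.get(unk_token_id)))
tokenizer_object.pre_tokenizer = Split("", behavior="removed")

print(tokenizer_object.encode("ALA").ids)  # [7, 6, 7] with this subset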
tokenizer_config.json
CHANGED
@@ -77,5 +77,33 @@
   "truncation_side": "right",
   "unk_token": "<unk>",
   "unk_token_id": 1,
-  "
+  "vocab": {
+    "<bos>": 3,
+    "<eos>": 4,
+    "<mask>": 2,
+    "<pad>": 0,
+    "<unk>": 1,
+    "A": 7,
+    "B": 26,
+    "C": 25,
+    "D": 15,
+    "E": 11,
+    "F": 20,
+    "G": 8,
+    "H": 23,
+    "I": 14,
+    "K": 17,
+    "L": 6,
+    "M": 22,
+    "N": 19,
+    "P": 16,
+    "Q": 18,
+    "R": 12,
+    "S": 10,
+    "T": 13,
+    "V": 9,
+    "W": 24,
+    "Y": 21,
+    "|": 5
+  }
 }
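With the vocabulary now embedded in tokenizer_config.json, the tokenizer can be rebuilt from the repository alone. A minimal loading sketch, assuming a placeholder repo id and that the config maps ProteinTokenizer to tokenizer.py via auto_map; trust_remote_code=True is needed because the class lives in the repo rather than in transformers:

from transformers import AutoTokenizer

# "user/protein-model" is a placeholder, not the actual repository id.
tok = AutoTokenizer.from_pretrained("user/protein-model", trust_remote_code=True)

enc = tok("MKTAYIAK")
print(enc["input_ids"])
print(tok.convert_ids_to_tokens(enc["input_ids"]))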