Lolalb committed
Commit 8c37c51 · verified · 1 Parent(s): de2fafd

Upload tokenizer

Files changed (2):
  1. tokenizer.py +13 -3
  2. tokenizer_config.json +10 -1
tokenizer.py CHANGED
@@ -1,6 +1,7 @@
 import torch
 from typing import List, Optional, Union, Dict
 from torch import Tensor
+import copy
 
 from itertools import compress
 
@@ -21,7 +22,7 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         bos_token_id: int,
         eos_token_id: int,
         unk_token_id: int,
-        max_length: int,
+        model_max_length: int,
         other_special_token_ids: Optional[List[int]] = None,
         ambiguous_token_ids: Optional[List[int]] = None,  # str = "XBOUZJ"
         **kwargs,
@@ -54,16 +55,25 @@ class ProteinTokenizer(PreTrainedTokenizerFast):
         tokenizer_object.pre_tokenizer = Split("", behavior="removed")
 
         super().__init__(
-            model_max_length=max_length,
+            vocab_path=vocab_path,
+            model_max_length=model_max_length,
             padding_side="right",
             truncation_side="right",
+            pad_token_id=pad_token_id,
             pad_token=id_to_token.get(pad_token_id),
+            mask_token_id=mask_token_id,
+            mask_token=id_to_token.get(mask_token_id),
+            bos_token_id=bos_token_id,
             bos_token=id_to_token.get(bos_token_id),
+            eos_token_id=eos_token_id,
             eos_token=id_to_token.get(eos_token_id),
+            unk_token_id=unk_token_id,
             unk_token=id_to_token.get(unk_token_id),
-            mask_token=id_to_token.get(mask_token_id),
+            other_special_token_ids=other_special_token_ids,
+            ambiguous_token_ids=ambiguous_token_ids,
             model_input_names=["input_ids", "attention_mask", "special_tokens_mask"],
             tokenizer_object=tokenizer_object,
+            **kwargs,
         )
 
         if other_special_token_ids is not None:
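
What the forwarded keyword arguments buy: arguments that PreTrainedTokenizerFast.__init__ does not consume itself are kept on the instance as init_kwargs and written back out by save_pretrained, which matches the new *_token_id, vocab_path, other_special_token_ids and ambiguous_token_ids entries in the tokenizer_config.json diff below. A minimal round-trip sketch, assuming the files from this commit sit in a local directory ./amplify_tokenizer (placeholder path):

    from transformers import AutoTokenizer

    # trust_remote_code is needed because auto_map points at tokenizer.ProteinTokenizer
    tok = AutoTokenizer.from_pretrained("./amplify_tokenizer", trust_remote_code=True)

    print(tok.model_max_length)           # 2048, restored from tokenizer_config.json
    print(tok.pad_token, tok.mask_token)  # <pad> <mask>
    print(tok.pad_token_id)               # 0, per the updated config

    # Saving re-serializes the stored init kwargs into tokenizer_config.json,
    # so the configuration round-trips without re-supplying the ids by hand.
    tok.save_pretrained("./amplify_tokenizer_resaved")
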
tokenizer_config.json CHANGED
@@ -41,6 +41,7 @@
       "special": true
     }
   },
+  "ambiguous_token_ids": null,
   "auto_map": {
     "AutoTokenizer": [
       "tokenizer.ProteinTokenizer",
@@ -48,18 +49,26 @@
     ]
   },
   "bos_token": "<bos>",
+  "bos_token_id": 3,
   "clean_up_tokenization_spaces": false,
   "eos_token": "<eos>",
+  "eos_token_id": 4,
   "mask_token": "<mask>",
+  "mask_token_id": 2,
   "model_input_names": [
     "input_ids",
     "attention_mask",
     "special_tokens_mask"
   ],
   "model_max_length": 2048,
+  "other_special_token_ids": null,
   "pad_token": "<pad>",
+  "pad_token_id": 0,
   "padding_side": "right",
   "tokenizer_class": "ProteinTokenizer",
   "truncation_side": "right",
-  "unk_token": "<unk>"
+  "unk_token": "<unk>",
+  "unk_token_id": 1,
+  "vocab_path": "/home/mila/l/lola.lebreton/AMPLIFY/conf/tokenizer/amplify_vocab.txt",
+  "vocab_size": 27
 }
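
For completeness, a short usage sketch against the updated files (the directory path and the protein sequence are placeholders; return_special_tokens_mask is a standard PreTrainedTokenizerFast option matching the model_input_names declared above):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./amplify_tokenizer", trust_remote_code=True)

    # The Split("", behavior="removed") pre-tokenizer breaks the sequence into
    # single residues, each mapped to one of the 27 vocabulary entries.
    enc = tok("MSVKKQ", return_tensors="pt", return_special_tokens_mask=True)
    print(enc["input_ids"])
    print(enc["attention_mask"])
    print(enc["special_tokens_mask"])
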