jon-tow committed on
Commit
dfe1f6f
·
1 Parent(s): ee2a4ae

fix: make `eos_token`/`pad_token` overridable

Browse files
tokenization_arcade100k.py CHANGED
@@ -124,8 +124,12 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
124
 
125
  self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
126
  self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
127
- self.eos_token = self.decoder[self.tokenizer.eot_token]
128
- self.pad_token = self.decoder[self.tokenizer.eot_token]
 
 
 
 
129
  # Expose for convenience
130
  self.mergeable_ranks = self.tokenizer._mergeable_ranks
131
  self.special_tokens = self.tokenizer._special_tokens
 
124
 
125
  self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
126
  self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
127
+ # Provide default `eos_token` and `pad_token`
128
+ if self.eos_token is None:
129
+ self.eos_token = self.decoder[self.tokenizer.eot_token]
130
+ if self.pad_token is None:
131
+ self.pad_token = self.decoder[self.tokenizer.pad_token]
132
+
133
  # Expose for convenience
134
  self.mergeable_ranks = self.tokenizer._mergeable_ranks
135
  self.special_tokens = self.tokenizer._special_tokens
tokenizer_config.json CHANGED
@@ -5,5 +5,7 @@
5
  "tokenization_arcade100k.Arcade100kTokenizer",
6
  null
7
  ]
8
- }
 
 
9
  }
 
5
  "tokenization_arcade100k.Arcade100kTokenizer",
6
  null
7
  ]
8
+ },
9
+ "eos_token": "<|endoftext|>",
10
+ "pad_token": "<|endoftext|>"
11
  }