fix: make `eos_token`/`pad_token` overridable
Browse files
- tokenization_arcade100k.py +6 -2
- tokenizer_config.json +3 -1
tokenization_arcade100k.py
CHANGED
@@ -124,8 +124,12 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
|
|
124 |
|
125 |
self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
|
126 |
self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
|
127 |
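-
self.eos_token = self.decoder[self.tokenizer.eot_token]
|
128 |
-
self.pad_token = self.decoder[self.tokenizer.pad_token]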
|
|
|
|
|
|
|
|
|
129 |
# Expose for convenience
|
130 |
self.mergeable_ranks = self.tokenizer._mergeable_ranks
|
131 |
self.special_tokens = self.tokenizer._special_tokens
|
|
|
124 |
|
125 |
self.decoder = {i: n for n, i in self.tokenizer._mergeable_ranks.items()}
|
126 |
self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
|
127 |
+
# Provide default `eos_token` and `pad_token`
|
128 |
+
if self.eos_token is None:
|
129 |
+
self.eos_token = self.decoder[self.tokenizer.eot_token]
|
130 |
+
if self.pad_token is None:
|
131 |
+
self.pad_token = self.decoder[self.tokenizer.pad_token]
|
132 |
+
|
133 |
# Expose for convenience
|
134 |
self.mergeable_ranks = self.tokenizer._mergeable_ranks
|
135 |
self.special_tokens = self.tokenizer._special_tokens
|
tokenizer_config.json
CHANGED
@@ -5,5 +5,7 @@
|
|
5 |
"tokenization_arcade100k.Arcade100kTokenizer",
|
6 |
null
|
7 |
]
|
8 |
-
}
|
|
|
|
|
9 |
}
|
|
|
5 |
"tokenization_arcade100k.Arcade100kTokenizer",
|
6 |
null
|
7 |
]
|
8 |
+
},
|
9 |
+
"eos_token": "<|endoftext|>",
|
10 |
+
"pad_token": "<|endoftext|>"
|
11 |
}
|