fix: re-ordering special tokens
tokenization_arcade100k.py CHANGED (+12 −10)
@@ -42,12 +42,14 @@ def _arcade100k(vocab_file: str):
     mergeable_ranks = _load_tiktoken_bpe(vocab_file)

     ENDOFTEXT = "<|endoftext|>"
-
+    FIM = [
+        "<|fim_prefix|>",
+        "<|fim_middle|>",
+        "<|fim_suffix|>",
+        "<|fim_pad|>",
+    ]
+    # `StarCoder` Tokens
     CODE = [
-        "<fim_prefix>",
-        "<fim_middle>",
-        "<fim_suffix>",
-        "<fim_pad>",
         "<gh_stars>",
         "<filename>",
         "<issue_start>",
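The renamed sentinels bring the FIM tokens in line with the `<|...|>` delimiter style used by the other control tokens. For context, a minimal sketch of how such sentinels are typically assembled into a prefix-suffix-middle (PSM) fill-in-the-middle prompt; the helper below is illustrative only and not part of this file:

def build_fim_prompt(prefix: str, suffix: str) -> str:
    # PSM ordering: the model generates the missing middle after <|fim_middle|>.
    return f"<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>"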
@@ -68,10 +70,9 @@ def _arcade100k(vocab_file: str):
         "<|im_end|>", # Chat: Input message end
     ]
     PAUSE = "<|pause|>" # Think before you speak (https://arxiv.org/abs/2310.02226)
-    REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)] # Register
+    REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)] # Register 0 sink token (https://arxiv.org/abs/2309.17453)
     ENDOFPROMPT = "<|endofprompt|>"
-
-    SPECIAL_TOKENS_NAMES = [ENDOFTEXT] + CODE + [ENDOFPROMPT] + CHAT + [PAUSE] + REGISTERS
+    SPECIAL_TOKENS_NAMES = [ENDOFTEXT] + FIM + CODE + [ENDOFPROMPT] + CHAT + [PAUSE] + REGISTERS + ["<|extra0|>"]
     START_ID = len(mergeable_ranks) + 1
     SPECIAL_TOKENS = {
         t: START_ID + i
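Because SPECIAL_TOKENS assigns IDs positionally (t: START_ID + i over an enumeration of SPECIAL_TOKENS_NAMES), inserting the FIM group ahead of CODE shifts the ID of every token that follows it, which is the point of this re-ordering fix. A toy sketch of the mechanism, assuming a hypothetical 100,000-entry merge table:

SPECIAL_TOKENS_NAMES = ["<|endoftext|>", "<|fim_prefix|>", "<|fim_middle|>"]
START_ID = 100_000 + 1  # stand-in for len(mergeable_ranks) + 1
SPECIAL_TOKENS = {t: START_ID + i for i, t in enumerate(SPECIAL_TOKENS_NAMES)}
print(SPECIAL_TOKENS)
# {'<|endoftext|>': 100001, '<|fim_prefix|>': 100002, '<|fim_middle|>': 100003}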
@@ -110,8 +111,9 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         **kwargs,
     ):
         super().__init__(errors=errors, **kwargs)
-        self._tiktoken_config = _arcade100k(
+        self._tiktoken_config = _arcade100k(vocab_file)
         self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
+
         # TODO: Remove this assertion
         assert (
             len(self.tokenizer._mergeable_ranks)
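_arcade100k(vocab_file) returns the keyword arguments that are unpacked into tiktoken.Encoding, pairing the BPE merge table with the explicit special-token map built above. A self-contained sketch with toy values (the name, pattern, and tables below are placeholders, not the real arcade100k configuration):

import tiktoken

enc = tiktoken.Encoding(
    name="toy",
    pat_str=r"\S+|\s+",                  # placeholder split regex
    mergeable_ranks={b"a": 0, b"b": 1},  # toy merge table
    special_tokens={"<|endoftext|>": 3},
)
print(enc.encode("<|endoftext|>", allowed_special="all"))  # -> [3]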
@@ -174,7 +176,7 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         Returns:
             `Tuple(str)`: Paths to the files saved.
         """
-        file_path = os.path.join(save_directory, "
+        file_path = os.path.join(save_directory, "arcade100k.tiktoken")
         with open(file_path, "w", encoding="utf8") as w:
             for k, v in self.tokenizer._mergeable_ranks.items():
                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
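The save loop writes the tiktoken vocabulary as one "base64(token) rank" pair per line, the same format _load_tiktoken_bpe reads back at load time. A minimal round-trip loader sketch (equivalent in spirit; the real implementation may differ):

import base64

def load_tiktoken_bpe(path: str) -> dict[bytes, int]:
    ranks = {}
    with open(path, "rb") as f:
        for line in f:
            if line.strip():
                token, rank = line.split()
                ranks[base64.b64decode(token)] = int(rank)
    return ranks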