jon-tow committed on
Commit
f221007
·
1 Parent(s): 3c66e0d

fix: re-ordering special tokens

Browse files
Files changed (1) hide show
  1. tokenization_arcade100k.py +12 -10
tokenization_arcade100k.py CHANGED
@@ -42,12 +42,14 @@ def _arcade100k(vocab_file: str):
42
  mergeable_ranks = _load_tiktoken_bpe(vocab_file)
43
 
44
  ENDOFTEXT = "<|endoftext|>"
45
- # StarCoder special tokens (https://huggingface.co/bigcode/starcoder/blob/main/tokenizer_config.json)
 
 
 
 
 
 
46
  CODE = [
47
- "<fim_prefix>",
48
- "<fim_middle>",
49
- "<fim_suffix>",
50
- "<fim_pad>",
51
  "<gh_stars>",
52
  "<filename>",
53
  "<issue_start>",
@@ -68,10 +70,9 @@ def _arcade100k(vocab_file: str):
68
  "<|im_end|>", # Chat: Input message end
69
  ]
70
  PAUSE = "<|pause|>" # Think before you speak (https://arxiv.org/abs/2310.02226)
71
- REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)] # Register/sink tokens (https://arxiv.org/abs/2309.17453)
72
  ENDOFPROMPT = "<|endofprompt|>"
73
-
74
- SPECIAL_TOKENS_NAMES = [ENDOFTEXT] + CODE + [ENDOFPROMPT] + CHAT + [PAUSE] + REGISTERS
75
  START_ID = len(mergeable_ranks) + 1
76
  SPECIAL_TOKENS = {
77
  t: START_ID + i
@@ -110,8 +111,9 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
110
  **kwargs,
111
  ):
112
  super().__init__(errors=errors, **kwargs)
113
- self._tiktoken_config = _arcade100k(self.vocab_files_names["vocab_file"])
114
  self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
 
115
  # TODO: Remove this assertion
116
  assert (
117
  len(self.tokenizer._mergeable_ranks)
@@ -174,7 +176,7 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
174
  Returns:
175
  `Tuple(str)`: Paths to the files saved.
176
  """
177
- file_path = os.path.join(save_directory, "qwen.tiktoken")
178
  with open(file_path, "w", encoding="utf8") as w:
179
  for k, v in self.tokenizer._mergeable_ranks.items():
180
  line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
 
42
  mergeable_ranks = _load_tiktoken_bpe(vocab_file)
43
 
44
  ENDOFTEXT = "<|endoftext|>"
45
+ FIM = [
46
+ "<|fim_prefix|>",
47
+ "<|fim_middle|>",
48
+ "<|fim_suffix|>",
49
+ "<|fim_pad|>",
50
+ ]
51
+ # `StarCoder` Tokens
52
  CODE = [
 
 
 
 
53
  "<gh_stars>",
54
  "<filename>",
55
  "<issue_start>",
 
70
  "<|im_end|>", # Chat: Input message end
71
  ]
72
  PAUSE = "<|pause|>" # Think before you speak (https://arxiv.org/abs/2310.02226)
73
+ REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)] # Register 0 sink token (https://arxiv.org/abs/2309.17453)
74
  ENDOFPROMPT = "<|endofprompt|>"
75
+ SPECIAL_TOKENS_NAMES = [ENDOFTEXT] + FIM + CODE + [ENDOFPROMPT] + CHAT + [PAUSE] + REGISTERS + ["<|extra0|>"]
 
76
  START_ID = len(mergeable_ranks) + 1
77
  SPECIAL_TOKENS = {
78
  t: START_ID + i
 
111
  **kwargs,
112
  ):
113
  super().__init__(errors=errors, **kwargs)
114
+ self._tiktoken_config = _arcade100k(vocab_file)
115
  self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
116
+
117
  # TODO: Remove this assertion
118
  assert (
119
  len(self.tokenizer._mergeable_ranks)
 
176
  Returns:
177
  `Tuple(str)`: Paths to the files saved.
178
  """
179
+ file_path = os.path.join(save_directory, "arcade100k.tiktoken")
180
  with open(file_path, "w", encoding="utf8") as w:
181
  for k, v in self.tokenizer._mergeable_ranks.items():
182
  line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"