jon-tow committed
Commit 9b47601 · 1 Parent(s): aceea5b

fix: create final list of special tokens

Files changed (1)
  1. tokenization_arcade100k.py +59 -46
tokenization_arcade100k.py CHANGED
@@ -41,42 +41,52 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
 def _arcade100k(vocab_file: str):
     mergeable_ranks = _load_tiktoken_bpe(vocab_file)
 
-    # Special Tokens
     ENDOFTEXT = "<|endoftext|>"
-    FIM_PREFIX = "<|fim_prefix|>"
-    FIM_MIDDLE = "<|fim_middle|>"
-    FIM_SUFFIX = "<|fim_suffix|>"
+    # StarCoder special tokens (https://huggingface.co/bigcode/starcoder/blob/main/tokenizer_config.json)
+    CODE = [
+        "<fim_prefix>",
+        "<fim_middle>",
+        "<fim_suffix>",
+        "<fim_pad>",
+        "<gh_stars>",
+        "<filename>",
+        "<issue_start>",
+        "<issue_comment>",
+        "<issue_closed>",
+        "<jupyter_start>",
+        "<jupyter_text>",
+        "<jupyter_code>",
+        "<jupyter_output>",
+        "<empty_output>",
+        "<commit_before>",
+        "<commit_msg>",
+        "<commit_after>",
+        "<reponame>"
+    ]
+    CHAT = [
+        "<|im_start|>",  # Chat: Input message start
+        "<|im_end|>",  # Chat: Input message end
+    ]
+    PAUSE = "<|pause|>"  # Think before you speak (https://arxiv.org/abs/2310.02226)
+    REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)]  # Register 0 sink token (https://arxiv.org/abs/2309.17453)
     ENDOFPROMPT = "<|endofprompt|>"
-
-    # Custom Special Tokens
-    IM_START = "<|im_start|>"  # Chat: Input message start
-    IM_END = "<|im_end|>"  # Chat: Input message end
-    PAUSE = "<|pause|>"  # Think before you speak (https://arxiv.org/abs/2310.02226)
-    # Register/sink tokens (https://arxiv.org/abs/2309.17453)
-    REGISTERS = [f"<|reg{i}|>" for i in range(0, 8)]
-    custom_special_tokens = {
-        t: 100261 + i for i, t in enumerate([IM_START, IM_END, PAUSE, *REGISTERS])
-    }
-    ENDOFPROMPT_ID = 100276
-
-    # Fill-out extra tokens
-    for i in range(100261 + len(custom_special_tokens), ENDOFPROMPT_ID + 1):
-        custom_special_tokens[f"<|extra{i}|>"] = i
-
-    special_tokens = {
-        ENDOFTEXT: 100257,
-        FIM_PREFIX: 100258,
-        FIM_MIDDLE: 100259,
-        FIM_SUFFIX: 100260,
-        **custom_special_tokens,
-        ENDOFPROMPT: 100276,
+
+    SPECIAL_TOKENS_NAMES = [ENDOFTEXT] + CODE + [ENDOFPROMPT] + CHAT + [PAUSE] + REGISTERS
+    START_ID = len(mergeable_ranks) + 1
+    SPECIAL_TOKENS = {
+        t: START_ID + i
+        for i, t in enumerate(SPECIAL_TOKENS_NAMES)
     }
 
+    print(len(mergeable_ranks))
+    print(len(SPECIAL_TOKENS))
+    print(len(mergeable_ranks) + len(SPECIAL_TOKENS))
+
     return {
         "name": NAME,
         "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
-        "special_tokens": special_tokens,
+        "special_tokens": SPECIAL_TOKENS,
     }
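A minimal sketch (not part of the commit) of how the new SPECIAL_TOKENS comprehension lays out ids; the toy mergeable_ranks below is invented, while the real vocabulary has on the order of 100k BPE ranks:

# Toy illustration of the id layout produced above (values here are invented).
mergeable_ranks = {b"a": 0, b"b": 1, b"c": 2}             # BPE ranks occupy ids 0..len-1
names = ["<|endoftext|>", "<fim_prefix>", "<|im_start|>"]
start_id = len(mergeable_ranks) + 1                       # as in the commit; id len(mergeable_ranks) stays unused
special_tokens = {t: start_id + i for i, t in enumerate(names)}
print(special_tokens)  # {'<|endoftext|>': 4, '<fim_prefix>': 5, '<|im_start|>': 6}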
 
@@ -108,41 +118,44 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         self.tokenizer = tiktoken.Encoding(**self._tiktoken_config)
         # TODO: Remove this assertion
         assert (
-            len(self._tiktoken_config["mergeable_ranks"])
-            + len(self._tiktoken_config["special_tokens"])
+            len(self.tokenizer._mergeable_ranks)
+            + len(self.tokenizer._special_tokens) + 1
             == self.tokenizer.n_vocab
-        ), f"{len(self._tiktoken_config['mergeable_ranks']) + len(self._tiktoken_config['special_tokens'])} != {self.tokenizer.n_vocab} in encoding"
+        ), f"{len(self.tokenizer._mergeable_ranks) + len(self.tokenizer._special_tokens)} != {self.tokenizer.n_vocab} in encoding"
 
         self.decoder = {
-            i: n for n, i in self._tiktoken_config["mergeable_ranks"].items()
+            i: n for n, i in self.tokenizer._mergeable_ranks.items()
         }
         self.decoder.update(
-            {i: n for n, i in self._tiktoken_config["special_tokens"].items()}
+            {i: n for n, i in self.tokenizer._special_tokens.items()}
         )
         self.eos_token = self.decoder[self.tokenizer.eot_token]
         self.pad_token = self.decoder[self.tokenizer.eot_token]
 
+    def __len__(self):
+        return self.tokenizer.n_vocab
+
     @property
     def vocab_size(self):
         return self.tokenizer.n_vocab
 
     def get_vocab(self) -> Dict[bytes, int]:
-        return self._tiktoken_config["mergeable_ranks"]
+        return self.tokenizer._mergeable_ranks
 
     def convert_tokens_to_ids(
         self, tokens: Union[bytes, str, List[Union[bytes, str]]]
     ) -> List[int]:
         ids = []
         if isinstance(tokens, (str, bytes)):
-            if tokens in self._tiktoken_config["special_tokens"]:
-                return self._tiktoken_config["special_tokens"][tokens]
+            if tokens in self.tokenizer._special_tokens:
+                return self.tokenizer._special_tokens[tokens]
             else:
-                return self._tiktoken_config["mergeable_ranks"].get(tokens)
+                return self.tokenizer._mergeable_ranks.get(tokens)
         for token in tokens:
-            if token in self._tiktoken_config["special_tokens"]:
-                ids.append(self._tiktoken_config["special_tokens"][token])
+            if token in self.tokenizer._special_tokens:
+                ids.append(self.tokenizer._special_tokens[token])
             else:
-                ids.append(self._tiktoken_config["mergeable_ranks"].get(token))
+                ids.append(self.tokenizer._mergeable_ranks.get(token))
         return ids
 
     def _add_tokens(
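A sketch (not from the repo, assuming tiktoken accepts a toy rank table) of why the assertion carries a "+ 1": Encoding.n_vocab is derived from the largest token id, and START_ID = len(mergeable_ranks) + 1 leaves one id unused between the BPE ranks and the special tokens.

import tiktoken

# Toy encoding: two BPE ranks (ids 0-1), one special token at id 3, id 2 unused.
enc = tiktoken.Encoding(
    name="toy",
    pat_str=r"\S+|\s+",
    mergeable_ranks={b"a": 0, b"b": 1},
    special_tokens={"<|endoftext|>": 3},
)
# n_vocab counts up to the highest id, so the unused id shows up as the "+ 1".
assert len(enc._mergeable_ranks) + len(enc._special_tokens) + 1 == enc.n_vocab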
@@ -167,7 +180,7 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         """
         file_path = os.path.join(save_directory, "qwen.tiktoken")
         with open(file_path, "w", encoding="utf8") as w:
-            for k, v in self._tiktoken_config["mergeable_ranks"].items():
+            for k, v in self.tokenizer._mergeable_ranks.items():
                 line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                 w.write(line)
         return (file_path,)
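For context, a small sketch (illustrative only, with invented ranks) of the on-disk format save_vocabulary writes and _load_tiktoken_bpe reads back: one base64-encoded token followed by its rank per line.

import base64

ranks = {b"hello": 0, b" world": 1}  # invented ranks for illustration
lines = [base64.b64encode(k).decode("utf8") + " " + str(v) for k, v in ranks.items()]
print(lines)  # ['aGVsbG8= 0', 'IHdvcmxk 1']
reloaded = {base64.b64decode(tok): int(rank) for tok, rank in (l.split() for l in lines)}
assert reloaded == ranks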
@@ -236,10 +249,10 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
         """Converts a token to an id using the vocab, special tokens included"""
-        if token in self._tiktoken_config["special_tokens"]:
-            return self._tiktoken_config["special_tokens"][token]
-        if token in self._tiktoken_config["mergeable_ranks"]:
-            return self._tiktoken_config["mergeable_ranks"][token]
+        if token in self.tokenizer._special_tokens:
+            return self.tokenizer._special_tokens[token]
+        if token in self.tokenizer._mergeable_ranks:
+            return self.tokenizer._mergeable_ranks[token]
         raise ValueError("unknown token")
 
     def _tokenize(self, text: str, **kwargs):
@@ -262,4 +275,4 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
             token_ids = [token_ids]
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.tokenizer.eot_token]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
+        return self.tokenizer.decode(token_ids)
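A usage sketch against the updated tokenizer; the repository path below is a placeholder, not something this commit names:

from transformers import AutoTokenizer

# Placeholder path; substitute the model repo that ships tokenization_arcade100k.py.
tok = AutoTokenizer.from_pretrained("path/to/arcade100k-model", trust_remote_code=True)
print(len(tok), tok.vocab_size)                   # both now report tiktoken's n_vocab
print(tok.convert_tokens_to_ids("<|im_start|>"))  # resolved via tokenizer._special_tokens
print(tok.convert_tokens_to_ids(b"hello"))        # resolved via tokenizer._mergeable_ranks (None if not a single BPE token)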