duzx16
committed on
Commit
·
3a99d79
1
Parent(s):
53f0197
Always add gmask in token ids
Browse files
- tokenization_chatglm.py +3 -14
tokenization_chatglm.py
CHANGED
|
@@ -326,22 +326,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
|
|
| 326 |
Returns:
|
| 327 |
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
| 328 |
"""
|
| 329 |
-
|
| 330 |
-
gmask_ids = self.sp_tokenizer[self.gmask_token]
|
| 331 |
eos_id = self.sp_tokenizer[self.eos_token]
|
| 332 |
-
|
| 333 |
-
token_ids_0 += [gmask_ids]
|
| 334 |
-
|
| 335 |
-
if token_ids_0[-1] != mask_ids and token_ids_0[-1] != gmask_ids:
|
| 336 |
-
token_ids_0 += [self.sp_tokenizer[self.end_token]]
|
| 337 |
-
|
| 338 |
-
token_ids_0 += [self.sp_tokenizer[self.bos_token]]
|
| 339 |
-
|
| 340 |
if token_ids_1 is not None:
|
| 341 |
-
|
| 342 |
-
token_ids_1 += [eos_id]
|
| 343 |
-
token_ids_0 += token_ids_1
|
| 344 |
-
|
| 345 |
return token_ids_0
|
| 346 |
|
| 347 |
def _pad(
|
|
|
|
| 326 |
Returns:
|
| 327 |
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
| 328 |
"""
|
| 329 |
+
gmask_id = self.sp_tokenizer[self.gmask_token]
|
|
|
|
| 330 |
eos_id = self.sp_tokenizer[self.eos_token]
|
| 331 |
+
token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
if token_ids_1 is not None:
|
| 333 |
+
token_ids_0 = token_ids_0 + token_ids_1 + [eos_id]
|
|
|
|
|
|
|
|
|
|
| 334 |
return token_ids_0
|
| 335 |
|
| 336 |
def _pad(
|