add more tokenizers
- vocab/README.md +2 -0
- vocab/__init__.py +13 -2
- vocab/chatglm_6b/__init__.py +8 -7
- vocab/code_davinci_002/__init__.py +3 -0
- vocab/deepseek_coder_33b_instruct/__init__.py +7 -0
- vocab/deepseek_llm_7b_base/__init__.py +5 -0
- vocab/gpt_35_turbo/__init__.py +0 -2
- vocab/text_davinci_003/__init__.py +70 -0
- vocab/tigerbot_13b_chat_v2/__init__.py +5 -0
- vocab/tigerbot_70b_chat_v4_4k/__init__.py +5 -0
- vocab/wizardcoder_15b_v1/__init__.py +4 -0
- vocab/wizardcoder_python_7b_v1/__init__.py +4 -0
- vocab/wizardlm_7b_v1/__init__.py +4 -0
- vocab/wizardmath_70b_v1/__init__.py +4 -0
vocab/README.md
CHANGED
@@ -1,4 +1,6 @@
 
+https://arxiv.org/abs/2308.16692 SpeechTokenizer
+
 For OpenAI's models, English is roughly 8-12x more token-efficient than Chinese.
 Previously, once a prompt exceeded about 300 Chinese characters, Turbo 3.5 16k would start producing logically inverted answers; after switching the prompt to English the problem never reappeared.
 
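Note: the 8-12x claim can be sanity-checked directly with tiktoken. A minimal sketch (the sample sentences are illustrative only, and the exact ratio depends heavily on the text):

# Compare token cost of roughly equivalent English and Chinese sentences.
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
en = "Artificial intelligence is transforming the world."
zh = "人工智能正在改变世界。"
print(len(enc.encode(en)), len(enc.encode(zh)))  # Chinese typically costs several times more tokens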
vocab/__init__.py
CHANGED
@@ -55,8 +55,6 @@ uniq_tokenizers = [
 all_tokenizers = [
     "gpt2",
     "gpt2_chinese",
-    "gpt_35_turbo",
-    "gpt_4",
 
     # bert series
     "bert_base_cased",
@@ -105,6 +103,10 @@ all_tokenizers = [
     "qwen_1_8b_chat",
     "qwen_7b_chat",
     "qwen_72b_chat",
+    "text_davinci_003",
+    "code_davinci_002",
+    "gpt_35_turbo",
+    "gpt_4",
 
     # uncategorized
     "skywork_13b_base",
@@ -116,6 +118,15 @@ all_tokenizers = [
     "flan_t5_base",
     "fastchat_t5_3b",
     "pko_t5_large",
+    "wizardcoder_15b_v1",
+    "wizardcoder_python_7b_v1",
+    "wizardlm_7b_v1",
+    "wizardmath_70b_v1",
+    "tigerbot_70b_chat_v4_4k",
+    "tigerbot_13b_chat_v2",
+    "deepseek_coder_33b_instruct",
+    "deepseek_llm_7b_base",
+
 
 
 ]
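Each name registered in all_tokenizers is expected to be a vocab/ subpackage exposing a tokenizer attribute, as the modules added below do. A minimal sketch of how such a registry can be resolved; the helper name is hypothetical, not part of this commit:

# Hypothetical helper: resolve a registry name to its tokenizer object.
import importlib

def load_tokenizer(name: str):
    # e.g. "deepseek_llm_7b_base" -> vocab.deepseek_llm_7b_base.tokenizer
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer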
vocab/chatglm_6b/__init__.py
CHANGED
@@ -6,15 +6,16 @@ import os
 import config
 from transformers import AutoTokenizer
 
-os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
 
 
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
-
-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
-
-
+
+# if config.USE_REMOTE:
+tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+# else:
+#     os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+#     CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+#     TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
+#     tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
 
 # https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py#L153
 tokenizer.comments = f"num_image_tokens: {tokenizer.sp_tokenizer.num_image_tokens}; num_text_tokens: {tokenizer.sp_tokenizer.num_text_tokens}"
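The commented-out branch hints at a remote/local switch; reinstated as real code it might look like the sketch below (assuming config defines a boolean USE_REMOTE, which this diff only implies):

# Sketch of the remote/local loading toggle implied by the comments above.
import os
import config
from transformers import AutoTokenizer

if config.USE_REMOTE:
    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
else:
    os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
    CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
    TOKENIZER_DIR = os.path.join(CURRENT_DIR, "chatglm_6b")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)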
vocab/code_davinci_002/__init__.py
CHANGED
@@ -0,0 +1,3 @@
+
+
+from vocab.text_davinci_003 import tokenizer
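Reusing the text_davinci_003 tokenizer is sound here: in tiktoken's model registry both text-davinci-003 and code-davinci-002 map to the p50k_base encoding. A quick check (assuming a recent tiktoken):

# Both models should resolve to the same underlying BPE encoding (p50k_base).
import tiktoken
assert (tiktoken.encoding_for_model("code-davinci-002").name
        == tiktoken.encoding_for_model("text-davinci-003").name)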
vocab/deepseek_coder_33b_instruct/__init__.py
ADDED
@@ -0,0 +1,7 @@
+"""
+https://huggingface.co/spaces/deepseek-ai/deepseek-coder-7b-instruct
+"""
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-33b-instruct", trust_remote_code=True)
vocab/deepseek_llm_7b_base/__init__.py
ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-llm-7b-base", trust_remote_code=True)
vocab/gpt_35_turbo/__init__.py
CHANGED
@@ -42,8 +42,6 @@ def get_vocab(self, token_type="str"):
     key_error_list = []
     unicode_decode_error_list = []
     for i in range(self.vocab_size):
-        if i == 100256:
-            print(i)
         try:
             token_byte = self.convert_ids_to_tokens([i])[0]
             if token_byte is None:
vocab/text_davinci_003/__init__.py
ADDED
@@ -0,0 +1,70 @@
+"""
+TODO
+"""
+
+import tiktoken
+from tiktoken import Encoding
+from utils.log_util import logger
+
+tokenizer = tiktoken.encoding_for_model('text-davinci-003')
+tokenizer.vocab_size = tokenizer.n_vocab
+
+tokenizer.comments = ""
+tokenizer.reversible = True
+
+
+
+
+def decode(self, tokens, errors="replace", skip_special_tokens=False):
+    """
+    The default decode may raise an error; see decode_test.py for details.
+    skip_special_tokens is accepted for compatibility with hf_tokenizer.
+    """
+    try:
+        decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
+    except:
+        decode_str = "null"
+    return decode_str
+
+def convert_ids_to_tokens(self, tokens, skip_special_tokens=False):
+    """
+    Why doesn't tiktoken provide this method?
+    """
+    try:
+        return tokenizer.decode_tokens_bytes(tokens)
+    except:
+        # Why return None? See zh_util.py
+        # 16 unused ids: 100256, 100261-100275
+        return [None for token in tokens]
+
+def get_vocab(self, token_type="str"):
+    """Returns vocab as a dict
+    :param token_type: ["str", "byte"]
+    :return:
+    """
+    vocab = {}
+    key_error_list = []
+    unicode_decode_error_list = []
+    for i in range(self.vocab_size):
+        try:
+            token_byte = self.convert_ids_to_tokens([i])[0]
+            if token_byte is None:
+                continue
+            # token_str = token_byte.decode("utf-8")
+            vocab[token_byte] = i
+
+        except UnicodeDecodeError:  # 773 UnicodeDecodeError
+            unicode_decode_error_list.append((i, str(token_byte)))
+            vocab[token_byte] = i
+
+    # vocab.update(self.added_tokens_encoder)
+    logger.info(f"text-davinci-003 {len(key_error_list)} KeyError: {key_error_list}")
+    logger.info(f"text-davinci-003 {len(unicode_decode_error_list)} UnicodeDecodeError: {unicode_decode_error_list[:5]}")
+    return vocab
+
+
+
+# tiktoken patch
+Encoding.decode = decode
+Encoding.convert_ids_to_tokens = convert_ids_to_tokens
+Encoding.get_vocab = get_vocab
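With the patch applied, the tiktoken Encoding gains a rough HF-tokenizer-like surface. A usage sketch (import path as in this repo):

# Usage sketch for the patched text-davinci-003 tokenizer.
from vocab.text_davinci_003 import tokenizer

ids = tokenizer.encode("hello world")
print(tokenizer.convert_ids_to_tokens(ids))  # token bytes; None for unused ids
print(tokenizer.decode(ids))                 # "hello world"
print(len(tokenizer.get_vocab()))            # token bytes -> id mapping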
vocab/tigerbot_13b_chat_v2/__init__.py
ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("TigerResearch/tigerbot-13b-chat-v2", trust_remote_code=True)
vocab/tigerbot_70b_chat_v4_4k/__init__.py
ADDED
@@ -0,0 +1,5 @@
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("TigerResearch/tigerbot-70b-chat-v4-4k", trust_remote_code=True)
vocab/wizardcoder_15b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-15B-V1.0", trust_remote_code=True)
vocab/wizardcoder_python_7b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardCoder-Python-7B-V1.0", trust_remote_code=True)
vocab/wizardlm_7b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardLM-7B-V1.0", trust_remote_code=True)
vocab/wizardmath_70b_v1/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("WizardLM/WizardMath-70B-V1.0", trust_remote_code=True)