Support Streaming
Browse files
- tokenization_qwen.py (+16 -2)
tokenization_qwen.py
CHANGED
|
@@ -27,6 +27,12 @@ logger = logging.getLogger(__name__)
|
|
| 27 |
|
| 28 |
|
| 29 |
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
| 32 |
ENDOFTEXT = "<|endoftext|>"
|
|
@@ -147,6 +153,10 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 147 |
self.box_end_id = self.special_tokens[self.box_end_tag]
|
| 148 |
self.quad_start_id = self.special_tokens[self.quad_start_tag]
|
| 149 |
self.quad_end_id = self.special_tokens[self.quad_end_tag]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
enc = tiktoken.Encoding(
|
| 152 |
"Qwen",
|
|
@@ -348,7 +358,11 @@ class QWenTokenizer(PreTrainedTokenizer):
|
|
| 348 |
token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
|
| 349 |
|
| 350 |
if skip_special_tokens:
|
| 351 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
|
| 353 |
|
| 354 |
def to_list_format(self, text: str):
|
|
@@ -515,7 +529,7 @@ class VisImage:
|
|
| 515 |
class Visualizer:
|
| 516 |
def __init__(self, img_rgb, metadata=None, scale=1.0):
|
| 517 |
self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
|
| 518 |
-
self.font_path =
|
| 519 |
self.output = VisImage(self.img, scale=scale)
|
| 520 |
self.cpu_device = torch.device("cpu")
|
| 521 |
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
|
| 30 |
+
FONT_PATH = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
|
| 31 |
+
if FONT_PATH is None:
|
| 32 |
+
if not os.path.exists("SimSun.ttf"):
|
| 33 |
+
ttf = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf")
|
| 34 |
+
open("SimSun.ttf", "wb").write(ttf.content)
|
| 35 |
+
FONT_PATH = "SimSun.ttf"
|
| 36 |
|
| 37 |
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
| 38 |
ENDOFTEXT = "<|endoftext|>"
|
|
|
|
| 153 |
self.box_end_id = self.special_tokens[self.box_end_tag]
|
| 154 |
self.quad_start_id = self.special_tokens[self.quad_start_tag]
|
| 155 |
self.quad_end_id = self.special_tokens[self.quad_end_tag]
|
| 156 |
+
self.image_special_tokens = set([
|
| 157 |
+
self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
|
| 158 |
+
self.quad_start_id, self.quad_end_id,
|
| 159 |
+
])
|
| 160 |
|
| 161 |
enc = tiktoken.Encoding(
|
| 162 |
"Qwen",
|
|
|
|
| 358 |
token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
|
| 359 |
|
| 360 |
if skip_special_tokens:
|
| 361 |
+
if kwargs.get('keep_image_special', False):
|
| 362 |
+
token_ids = [i for i in token_ids if i < self.eod_id
|
| 363 |
+
or i in self.image_special_tokens]
|
| 364 |
+
else:
|
| 365 |
+
token_ids = [i for i in token_ids if i < self.eod_id]
|
| 366 |
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
|
| 367 |
|
| 368 |
def to_list_format(self, text: str):
|
|
|
|
| 529 |
class Visualizer:
|
| 530 |
def __init__(self, img_rgb, metadata=None, scale=1.0):
|
| 531 |
self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
|
| 532 |
+
self.font_path = FONT_PATH
|
| 533 |
self.output = VisImage(self.img, scale=scale)
|
| 534 |
self.cpu_device = torch.device("cpu")
|
| 535 |
|