Qwen
/

Qwen-VL-Chat-Int4

@@ -27,6 +27,12 @@ logger = logging.getLogger(__name__)
 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
 PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
 ENDOFTEXT = "<|endoftext|>"
@@ -147,6 +153,10 @@ class QWenTokenizer(PreTrainedTokenizer):
         self.box_end_id = self.special_tokens[self.box_end_tag]
         self.quad_start_id = self.special_tokens[self.quad_start_tag]
         self.quad_end_id = self.special_tokens[self.quad_end_tag]
         enc = tiktoken.Encoding(
             "Qwen",
@@ -348,7 +358,11 @@ class QWenTokenizer(PreTrainedTokenizer):
         token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
         if skip_special_tokens:
-            token_ids = [i for i in token_ids if i < self.eod_id]
         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
     def to_list_format(self, text: str):
@@ -515,7 +529,7 @@ class VisImage:
 class Visualizer:
     def __init__(self, img_rgb, metadata=None, scale=1.0):
         self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
-        self.font_path = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
         self.output = VisImage(self.img, scale=scale)
         self.cpu_device = torch.device("cpu")

 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken", "ttf": "SimSun.ttf"}
+# Resolve the SimSun font used for drawing: prefer the copy in the local
+# Hugging Face cache, otherwise fall back to "SimSun.ttf" in the current
+# working directory, downloading it there first if it is missing.
+FONT_PATH = try_to_load_from_cache("Qwen/Qwen-VL-Chat", "SimSun.ttf")
+if FONT_PATH is None:
+    if not os.path.exists("SimSun.ttf"):
+        # A timeout keeps module import from hanging forever on a stalled
+        # connection.
+        ttf = requests.get(
+            "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/SimSun.ttf",
+            timeout=60,
+        )
+        # Fail loudly on an HTTP error instead of saving an error page as the
+        # font file, which the exists() check above would then reuse forever.
+        ttf.raise_for_status()
+        # Context manager guarantees the handle is flushed and closed.
+        with open("SimSun.ttf", "wb") as font_file:
+            font_file.write(ttf.content)
+    FONT_PATH = "SimSun.ttf"
 PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
 ENDOFTEXT = "<|endoftext|>"
         self.box_end_id = self.special_tokens[self.box_end_tag]
         self.quad_start_id = self.special_tokens[self.quad_start_tag]
         self.quad_end_id = self.special_tokens[self.quad_end_tag]
+        self.image_special_tokens = set([
+            self.ref_start_id, self.ref_end_id, self.box_start_id, self.box_end_id,
+            self.quad_start_id, self.quad_end_id,
+        ])
         enc = tiktoken.Encoding(
             "Qwen",
         token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
         if skip_special_tokens:
+            if kwargs.get('keep_image_special', False):
+                token_ids = [i for i in token_ids if i < self.eod_id
+                    or i in self.image_special_tokens]
+            else:
+                token_ids = [i for i in token_ids if i < self.eod_id]
         return self.tokenizer.decode(token_ids, errors=errors or self.errors)
     def to_list_format(self, text: str):
 class Visualizer:
     def __init__(self, img_rgb, metadata=None, scale=1.0):
         self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        self.font_path = FONT_PATH
         self.output = VisImage(self.img, scale=scale)
         self.cpu_device = torch.device("cpu")