# Copyright (c) 2024 Alibaba Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import re
from typing import Iterable, List, Union

import numpy as np
import torch
from transformers import AutoTokenizer

from inspiremusic.text.abs_tokenizer import AbsTokenizer


def get_tokenizer(tokenizer_name, tokenizer_path):
    if "qwen" in tokenizer_name:
        return QwenTokenizer(tokenizer_path, skip_special_tokens=True)
    else:
        return None


class QwenTokenizer(AbsTokenizer):
    def __init__(
            self,
            token_path: str,
            skip_special_tokens: bool = True,
    ):
        super().__init__()
        # NOTE: with a non-chat (base) model, all of these special tokens remain randomly initialized.
        special_tokens = {
            'eos_token': '<|endoftext|>',
            'pad_token': '<|endoftext|>',
            'additional_special_tokens': [
                '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
                '[breath]', '<strong>', '</strong>', '[noise]',
                '[laughter]', '[cough]', '[clucking]', '[accent]',
                '[quick_breath]',
            ]
        }
        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
        self.tokenizer.add_special_tokens(special_tokens)
        self.skip_special_tokens = skip_special_tokens

    def get_vocab_size(self):
        return self.tokenizer.vocab_size

    def text2tokens(self, line: str) -> List:
        # Tokenize a single line of text into a plain list of token ids.
        tokens = self.tokenizer([line], return_tensors="pt")
        tokens = tokens["input_ids"][0].cpu().tolist()
        return tokens

    def tokens2text(self, tokens) -> str:
        # Decode a list of token ids back to text, dropping special tokens if configured to.
        tokens = torch.tensor(tokens, dtype=torch.int64)
        text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0]
        return text
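

# Example (sketch): a round trip through QwenTokenizer, assuming a local or Hub
# Qwen2.5 checkpoint; "Qwen/Qwen2.5-0.5B" is only a placeholder id, not a path
# this project necessarily uses.
#
#     tok = QwenTokenizer("Qwen/Qwen2.5-0.5B", skip_special_tokens=True)
#     ids = tok.text2tokens("hello [laughter] world")  # list of int token ids
#     tok.tokens2text(ids)                             # added special tokens are dropped on decode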


def get_qwen_vocab_size(token_type: str):
    if "qwen1.5" in token_type.lower() or "qwen2.0" in token_type.lower() or "qwen2.5" in token_type.lower():
        # 293 slots are reserved for special and extra tokens (endoftext, im_start, im_end,
        # endofprompt, and tokens that may be added in the future).
        # model.vocab_size = 151936, tokenizer.vocab_size = 151643
        # NOTE: only the first three special tokens (endoftext, im_start, im_end) are trained
        # in the Chat series models; the others remain randomly initialized.
        return 151643 + 293
    else:
        raise ValueError(f"Unknown tokenizer {token_type}")
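

if __name__ == "__main__":
    # Minimal usage sketch tying the helpers together. The checkpoint id below is a
    # placeholder; point it at the Qwen tokenizer actually used by the project.
    print(get_qwen_vocab_size("qwen2.5"))  # 151643 + 293 = 151936
    tokenizer = get_tokenizer("qwen2.5", "Qwen/Qwen2.5-0.5B")
    if tokenizer is not None:
        ids = tokenizer.text2tokens("hello world")
        print(ids, tokenizer.tokens2text(ids))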