|
# Task name -> special token string that marks the task in a prompt.
# NOTE: the token suffix is not always the key itself: "caption" maps to
# "<|task_cap|>" and "speech_edit" maps to "<|task_edit|>".
TASK_TOKEN_MAP = {
    "vc": "<|task_vc|>",
    "tts": "<|task_tts|>",
    "asr": "<|task_asr|>",
    "s2s": "<|task_s2s|>",
    "t2s": "<|task_t2s|>",
    "understand": "<|task_understand|>",
    "caption": "<|task_cap|>",
    "controllable_tts": "<|task_controllable_tts|>",
    "prompt_tts": "<|task_prompt_tts|>",
    "speech_edit": "<|task_edit|>",
}
|
|
|
# Coarse level label -> zero-based level index (0 = lowest, 4 = highest).
# The index is what ends up inside the "*_label_{index}" tokens.
LEVELS_MAP = {
    "very_low": 0,
    "low": 1,
    "moderate": 2,
    "high": 3,
    "very_high": 4,
}
|
|
|
# One-based position -> level label. Each key is the matching LEVELS_MAP
# index plus one (1 -> "very_low" ... 5 -> "very_high").
# NOTE(review): presumably the values offered by a UI control -- confirm
# against the caller.
LEVELS_MAP_UI = {
    1: "very_low",
    2: "low",
    3: "moderate",
    4: "high",
    5: "very_high",
}
|
|
|
# Gender label -> integer id placed inside the "<|gender_{id}|>" token.
GENDER_MAP = {
    "female": 0,
    "male": 1,
}
|
|
|
# Age-group label -> integer id placed inside the "<|age_{id}|>" token.
# Keys are case-sensitive and use the exact spellings below.
AGE_MAP = {
    "Child": 0,
    "Teenager": 1,
    "Youth-Adult": 2,
    "Middle-aged": 3,
    "Elderly": 4,
}
|
|
|
# Emotion / speaking-style label -> integer id placed inside the
# "<|emotion_{id}|>" token. Ids are contiguous from 0 to 24; keys are
# upper-case and case-sensitive.
EMO_MAP = {
    "UNKNOWN": 0,
    "NEUTRAL": 1,
    "ANGRY": 2,
    "HAPPY": 3,
    "SAD": 4,
    "FEARFUL": 5,
    "DISGUSTED": 6,
    "SURPRISED": 7,
    "SARCASTIC": 8,
    "EXCITED": 9,
    "SLEEPY": 10,
    "CONFUSED": 11,
    "EMPHASIS": 12,
    "LAUGHING": 13,
    "SINGING": 14,
    "WORRIED": 15,
    "WHISPER": 16,
    "ANXIOUS": 17,
    "NO-AGREEMENT": 18,
    "APOLOGETIC": 19,
    "CONCERNED": 20,
    "ENUNCIATED": 21,
    "ASSERTIVE": 22,
    "ENCOURAGING": 23,
    "CONTEMPT": 24,
}
|
|
|
|
|
class TokenParser:
    """Turn labels and attribute values into special tokens.

    Parses the attributes of a person (age, gender, pitch, pitch variation,
    loudness, speed, emotion) as well as the task name. Every public method
    is a static method that formats one label or numeric value as a
    ``<|...|>`` token string.

    Label methods look the label up in the corresponding module-level map
    and raise ``KeyError`` for an unknown label. Value methods truncate
    their input with ``int()`` and clamp it to a fixed range.
    """

    def __init__(self):
        pass

    @staticmethod
    def _clamp(value, low: int, high: int) -> int:
        """Truncate ``value`` with ``int()`` and limit it to ``[low, high]``."""
        return min(high, max(low, int(value)))

    @staticmethod
    def age(age: str) -> str:
        """Return the age token for an ``AGE_MAP`` label.

        Raises:
            KeyError: If ``age`` is not a key of ``AGE_MAP``.
        """
        age_id = AGE_MAP[age]
        return f"<|age_{age_id}|>"

    @staticmethod
    def gender(gender: str) -> str:
        """Return the gender token for a ``GENDER_MAP`` label.

        Raises:
            KeyError: If ``gender`` is not a key of ``GENDER_MAP``.
        """
        gender_id = GENDER_MAP[gender]
        return f"<|gender_{gender_id}|>"

    @staticmethod
    def mel_value(mel: int) -> str:
        """Return the mel-scale pitch value token, clamped to [0, 1000]."""
        mel = TokenParser._clamp(mel, 0, 1000)
        return f"<|pitch_value_{mel}|>"

    @staticmethod
    def mel_level(level: str) -> str:
        """Return the pitch level token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: If ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_label_{level_tag}|>"

    @staticmethod
    def pitch_var_value(pitch_std: int) -> str:
        """Return the pitch-std value token, clamped to [0, 10].

        The input is coerced with ``int()`` like the other ``*_value``
        methods, rather than checked by an ``assert`` that disappears
        under ``python -O``.
        """
        pitch_std = TokenParser._clamp(pitch_std, 0, 10)
        return f"<|pitch_var_value_{pitch_std}|>"

    @staticmethod
    def pitch_var_level(level: str) -> str:
        """Return the pitch-std level token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: If ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|pitch_var_label_{level_tag}|>"

    @staticmethod
    def loudness_value(loudness: int) -> str:
        """Return the loudness value token, clamped to [0, 30].

        Negative input is clamped to 0. An earlier ``assert loudness >= 0``
        made that lower clamp unreachable and changed behaviour depending
        on whether asserts were enabled.
        """
        loudness = TokenParser._clamp(loudness, 0, 30)
        return f"<|loudness_value_{loudness}|>"

    @staticmethod
    def loudness_level(level: str) -> str:
        """Return the loudness level token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: If ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|loudness_label_{level_tag}|>"

    @staticmethod
    def speed_value(speed: int) -> str:
        """Return the speed value token, clamped to [0, 10]."""
        speed = TokenParser._clamp(speed, 0, 10)
        return f"<|speed_value_{speed}|>"

    @staticmethod
    def speed_level(level: str) -> str:
        """Return the speed level token for a ``LEVELS_MAP`` label.

        Raises:
            KeyError: If ``level`` is not a key of ``LEVELS_MAP``.
        """
        level_tag = LEVELS_MAP[level]
        return f"<|speed_label_{level_tag}|>"

    @staticmethod
    def task(task: str) -> str:
        """Return the special token registered for ``task``.

        Raises:
            KeyError: If ``task`` is not a key of ``TASK_TOKEN_MAP``. An
                explicit exception is used instead of ``assert`` so the
                check behaves the same with and without ``python -O``.
        """
        try:
            return TASK_TOKEN_MAP[task]
        except KeyError:
            raise KeyError(
                f"Unknown task {task!r}; expected one of {sorted(TASK_TOKEN_MAP)}"
            ) from None

    @staticmethod
    def emotion(emotion: str) -> str:
        """Return the emotion token for an ``EMO_MAP`` label.

        Raises:
            KeyError: If ``emotion`` is not a key of ``EMO_MAP``.
        """
        emo_id = EMO_MAP[emotion]
        return f"<|emotion_{emo_id}|>"
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: build a token string for five sample attribute sets, then
    # print the tokenizer's ids for it and the text decoded from those ids.
    import sys

    from transformers import AutoTokenizer

    # The tokenizer location can be overridden with the first command-line
    # argument; without one, the originally hard-coded path is used.
    tokenizer_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/aifs4su/xinshengwang/code/StyleCraft/tokenizer/stylecraft-bicodec-pitch-loudness-speed-emotion-tokenizer"
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Parallel sample columns: position k of every list forms one sample.
    tasks = ["tts", "tts", "understand", "controllable_tts", "prompt_tts"]
    ages = ["Child", "Teenager", "Youth-Adult", "Middle-aged", "Elderly"]
    genders = ["female", "female", "female", "male", "male"]
    mels = [100, 200, 300, 400, 500]
    mel_levels = ["very_low", "low", "moderate", "high", "very_high"]
    loudnesses = [1, 10, 23, 19, 30]
    loudness_levels = ["very_low", "low", "moderate", "high", "very_high"]
    emotions = ["UNKNOWN", "NEUTRAL", "ANGRY", "HAPPY", "SAD"]

    samples = zip(
        tasks,
        ages,
        genders,
        mels,
        mel_levels,
        loudnesses,
        loudness_levels,
        emotions,
    )
    for task, age, gender, mel, mel_level, loudness, loudness_level, emotion in samples:
        tokens = [
            TokenParser.task(task),
            TokenParser.age(age),
            TokenParser.gender(gender),
            TokenParser.mel_value(mel),
            TokenParser.mel_level(mel_level),
            TokenParser.loudness_value(loudness),
            TokenParser.loudness_level(loudness_level),
            TokenParser.emotion(emotion),
        ]
        inputs = "".join(tokens)
        ids = tokenizer.encode(inputs, add_special_tokens=False)
        print(ids)
        print("decode", tokenizer.decode(ids))
|
|