# Spaces: KdaiP / StableTTS1.1 (Running) | File size: 4,275 Bytes | 3dd84f8

import re
from typing import Dict, List
from pypinyin import lazy_pinyin, Style
from .custom_pypinyin_dict import phrase_pinyin_data
import jieba
from .cn2an import an2cn

# Load the custom phrase-pinyin dictionary data.
phrase_pinyin_data.load()

# Character-level normalization table, applied with ``str.translate``.
# Full-width / CJK punctuation is folded to ASCII, every quote- or
# bracket-like mark becomes an apostrophe, and circled numbers, Greek
# letters and arithmetic operators are spelled out in Chinese.
PUNC_MAP: Dict[str, str] = {
    # Clause separators -> comma.
    **dict.fromkeys("：；，·、", ","),
    # Full stop, newline and '$' -> period.
    **dict.fromkeys("。\n$", "."),
    "！": "!",
    "？": "?",
    "—": "-",
    "～": "~",
    # Quotes and brackets of every flavour -> apostrophe.
    **dict.fromkeys("“”\"‘’（）()《》【】[]「」『』", "'"),

    # The entries below come from
    # GPT_SoVITS.text.zh_normalization.text_normlization
    "/": "每",
    # Circled numbers -> Chinese numerals.
    **dict(zip("①②③④⑤⑥⑦⑧⑨⑩", "一二三四五六七八九十")),
    # Greek letters -> Chinese transliteration of the letter name.
    "α": "阿尔法",
    "β": "贝塔",
    **dict.fromkeys("γΓ", "伽玛"),
    **dict.fromkeys("δΔ", "德尔塔"),
    "ε": "艾普西龙",
    "ζ": "捷塔",
    "η": "依塔",
    **dict.fromkeys("θΘ", "西塔"),
    "ι": "艾欧塔",
    "κ": "喀帕",
    **dict.fromkeys("λΛ", "拉姆达"),
    "μ": "缪",
    "ν": "拗",
    **dict.fromkeys("ξΞ", "克西"),
    "ο": "欧米克伦",
    **dict.fromkeys("πΠ", "派"),
    "ρ": "肉",
    **dict.fromkeys("ςσΣ", "西格玛"),
    "τ": "套",
    "υ": "宇普西龙",
    **dict.fromkeys("φΦ", "服艾"),
    "χ": "器",
    **dict.fromkeys("ψΨ", "普赛"),
    **dict.fromkeys("ωΩ", "欧米伽"),
    # Arithmetic operators -> spoken form.
    "+": "加",
    "-": "减",
    "×": "乘",
    "÷": "除",
    "=": "等",

    # Interjections rewritten to other characters.
    "嗯": "恩",
    "呣": "母",
}

# Translation table consumed by ``str.translate``.
PUNC_TABLE = str.maketrans(PUNC_MAP)

# Number normalization: matches a run of digits, optionally continued by
# further digits with an optional "." in between (e.g. "12", "3.14").
NUMBER_PATTERN: re.Pattern = re.compile(r'\d+(?:\.?\d+)?')

def replace_number(match: re.Match) -> str:
    """Convert the Arabic-numeral text captured by *match* to Chinese characters.

    Written as a replacement callback for ``re`` substitution: the whole
    matched string is handed to ``an2cn`` and its result is returned.
    """
    matched_digits = match.group()
    return an2cn(matched_digits)

def normalize_number(text: str) -> str:
    """Return *text* with every ``NUMBER_PATTERN`` match rewritten by ``replace_number``."""
    return re.sub(NUMBER_PATTERN, replace_number, text)

def load_pinyin_symbols(path):
    """Load a ``pinyin,phone phone ...`` table and print its toned phone set.

    Helper for listing phone symbols; it is not used elsewhere in this file.

    Args:
        path: Path to a UTF-8 text file with one comma-separated entry per
            line. The first field is the pinyin syllable, the second field
            is a space-separated list of phones.

    Returns:
        Dict mapping each syllable to its list of phones.

    Raises:
        IndexError: If a non-blank line contains no comma.

    Side effects:
        Prints every phone suffixed with each tone number 1-5, ordered by
        string length.
    """
    pinyin_dict = {}
    unique_phones = set()
    with open(path, "r", encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) hold no entry.
                continue
            cuts = line.split(',')
            phones = cuts[1].split(' ')
            pinyin_dict[cuts[0]] = phones
            unique_phones.update(phones)

    # Sort the phones first so the printed listing is reproducible from run
    # to run instead of following set iteration order.
    tone = [phone + str(i) for phone in sorted(unique_phones) for i in range(1, 6)]
    print(sorted(tone, key=len))
    return pinyin_dict

def load_pinyin_dict(path: str) -> Dict[str, List[str]]:
    """Load a syllable-to-phones table from a UTF-8 text file.

    Each non-blank line has the form ``key,phone phone ...``: everything
    before the first comma is the key, and the remainder is split on
    whitespace to form the phone list.

    Args:
        path: Path of the table file.

    Returns:
        Dict mapping each key to its list of phones. A key that appears
        more than once keeps the phones from its last occurrence.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    pinyin_dict: Dict[str, List[str]] = {}
    with open(path, "r", encoding='utf-8') as f:
        for lineno, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines such as a trailing newline at EOF.
                continue
            key, sep, value = line.partition(',')
            if not sep:
                raise ValueError(
                    f"{path}:{lineno}: expected 'key,phones', got {line!r}"
                )
            pinyin_dict[key] = value.split()
    return pinyin_dict

import os

# The phone table lives in the ``cnm3`` directory next to this module; the
# path is built from this file's own location.
_CNM3_TABLE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cnm3', 'ds_CNM3.txt')
pinyin_dict = load_pinyin_dict(_CNM3_TABLE_PATH)

def chinese_to_cnm3(text: str) -> List[str]:
    """Convert Chinese text into a flat list of tone-tagged phones.

    Pipeline:
      1. Normalize punctuation with ``PUNC_TABLE`` and numbers with
         ``normalize_number``.
      2. Remove a fixed set of special characters.
      3. Segment with ``jieba.lcut`` and convert each word with
         ``lazy_pinyin`` (``Style.TONE3``, neutral tone written as 5).
      4. For each token ending in a tone digit, look the syllable up in
         ``pinyin_dict`` and append the digit to every phone.

    Args:
        text: Input text.

    Returns:
        List of phone strings. Each phone of a recognized syllable carries
        that syllable's tone digit. Tokens that end in neither a digit nor
        a letter (punctuation, whitespace) are emitted one character at a
        time. Tokens ending in a letter, and toned syllables that are
        missing from ``pinyin_dict``, are skipped.
    """
    # Punctuation and number normalization.
    text = text.translate(PUNC_TABLE)
    text = normalize_number(text)
    # Filter out special characters.
    text = re.sub(r'[#&@“”^_|\\]', '', text)

    phones: List[str] = []
    for word in jieba.lcut(text, cut_all=False):
        pinyin_list: List[str] = lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True)
        for pinyin in pinyin_list:
            if not pinyin:
                # Nothing to index into; guards the pinyin[-1] access below.
                continue
            last = pinyin[-1]
            if last.isdigit():
                syllable_phones = pinyin_dict.get(pinyin[:-1])
                if syllable_phones is None:
                    # Syllable absent from the table: skip it, the same way
                    # tokens ending in a letter are skipped, instead of
                    # aborting the whole utterance with a KeyError.
                    continue
                phones.extend(ph + last for ph in syllable_phones)
            elif last.isalpha():
                # Token carries no tone digit; it is dropped.
                continue
            else:
                # Punctuation / other symbols pass through per character.
                phones.extend(pinyin)

    return phones