Spaces:
Running
Running
File size: 4,275 Bytes
3dd84f8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import re
from typing import Dict, List
from pypinyin import lazy_pinyin, Style
from .custom_pypinyin_dict import phrase_pinyin_data
import jieba
from .cn2an import an2cn
# Load the custom pinyin dictionary data (from .custom_pypinyin_dict) once,
# at import time, before any text is converted.
phrase_pinyin_data.load()
# Punctuation normalization table: maps each single source character to its
# replacement. It is extended further down with symbol / Greek-letter readings
# and then compiled into PUNC_TABLE with str.maketrans.
# NOTE(review): "(" and ")" each appear twice below with the same value, so the
# second pair is redundant as written -- possibly full-width variants that were
# normalized somewhere upstream; confirm against the intended character set.
# NOTE: the "/" entry here is overridden by the later PUNC_MAP.update(...)
# call, which maps "/" to "每".
PUNC_MAP: Dict[str, str] = {
":": ",",
";": ",",
",": ",",
"。": ".",
"!": "!",
"?": "?",
"\n": ".",
"·": ",",
"、": ",",
"$": ".",
"/": ",",
"“": "'",
"”": "'",
'"': "'",
"‘": "'",
"’": "'",
"(": "'",
")": "'",
"(": "'",
")": "'",
"《": "'",
"》": "'",
"【": "'",
"】": "'",
"[": "'",
"]": "'",
"—": "-",
"~": "~",
"「": "'",
"」": "'",
"『": "'",
"』": "'",
}
# from GPT_SoVITS.text.zh_normalization.text_normlization
# Extra single-character replacements: circled numbers, Greek letters and
# arithmetic operators are spelled out in Chinese, plus two character
# substitutions ("嗯" -> "恩", "呣" -> "母").
# Because dict.update overwrites existing keys, the '/' entry below replaces
# the earlier "/" -> "," mapping.
PUNC_MAP.update ({
'/': '每',
'①': '一',
'②': '二',
'③': '三',
'④': '四',
'⑤': '五',
'⑥': '六',
'⑦': '七',
'⑧': '八',
'⑨': '九',
'⑩': '十',
'α': '阿尔法',
'β': '贝塔',
'γ': '伽玛',
'Γ': '伽玛',
'δ': '德尔塔',
'Δ': '德尔塔',
'ε': '艾普西龙',
'ζ': '捷塔',
'η': '依塔',
'θ': '西塔',
'Θ': '西塔',
'ι': '艾欧塔',
'κ': '喀帕',
'λ': '拉姆达',
'Λ': '拉姆达',
'μ': '缪',
'ν': '拗',
'ξ': '克西',
'Ξ': '克西',
'ο': '欧米克伦',
'π': '派',
'Π': '派',
'ρ': '肉',
'ς': '西格玛',
'σ': '西格玛',
'Σ': '西格玛',
'τ': '套',
'υ': '宇普西龙',
'φ': '服艾',
'Φ': '服艾',
'χ': '器',
'ψ': '普赛',
'Ψ': '普赛',
'ω': '欧米伽',
'Ω': '欧米伽',
'+': '加',
'-': '减',
'×': '乘',
'÷': '除',
'=': '等',
"嗯": "恩",
"呣": "母"
})
# Translation table consumed by str.translate. str.maketrans requires every
# key of the mapping to be exactly one character long. Translation is a single
# pass, so a "-" produced from "—" is not itself re-mapped to "减".
PUNC_TABLE = str.maketrans(PUNC_MAP)
# Number normalization: matches a run of digits, optionally followed by a
# fractional part (e.g. "12" or "3.14").
NUMBER_PATTERN: re.Pattern = re.compile(r'\d+(?:\.?\d+)?')
def replace_number(match: re.Match) -> str:
    """Regex substitution callback: render the matched numeral in Chinese.

    Args:
        match: A match object whose full text is the Arabic-numeral string
            to convert.

    Returns:
        Whatever ``an2cn`` returns for the matched text.
    """
    digits = match.group(0)
    return an2cn(digits)
def normalize_number(text: str) -> str:
    """Replace every number found by ``NUMBER_PATTERN`` in *text*.

    Each match is passed through ``replace_number``; all other characters
    are returned unchanged.
    """
    return re.sub(NUMBER_PATTERN, replace_number, text)
# get symbols of phones, not used
def load_pinyin_symbols(path):
    """Load a pinyin-to-phones table and print every phone/tone symbol.

    Each non-blank line of the file is expected to look like
    ``<pinyin>,<phone> <phone> ...``; the second comma-separated field is
    split on single spaces.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        A dict mapping each pinyin syllable to its list of phones.

    Side effects:
        Prints the list of all ``phone + tone`` symbols (tones 1-5), ordered
        by symbol length.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    pinyin_dict = {}
    unique_phones = set()
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no entry; skip them instead of failing
                # on the missing second field.
                continue
            cuts = line.split(',')
            phones = cuts[1].split(' ')
            pinyin_dict[cuts[0]] = phones
            unique_phones.update(phones)

    # Iterate the phones in sorted order so the printed list is identical
    # from run to run (plain set iteration order is not stable for strings).
    tone = [phone + str(i) for phone in sorted(unique_phones) for i in range(1, 6)]
    print(sorted(tone, key=len))
    return pinyin_dict
def load_pinyin_dict(path: str) -> Dict[str, List[str]]:
    """Load a pinyin-to-phones table from a text file.

    Each non-blank line must look like ``<pinyin>,<phone> <phone> ...``.
    The text after the first comma is split on any whitespace.

    Args:
        path: Path of the UTF-8 encoded table file.

    Returns:
        A dict mapping each pinyin syllable to its list of phones. If a
        syllable occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    pinyin_dict: Dict[str, List[str]] = {}
    with open(path, "r", encoding='utf-8') as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (e.g. trailing newlines) rather than
                # failing on the two-name unpack below.
                continue
            key, value = entry.split(',', 1)
            pinyin_dict[key] = value.split()
    return pinyin_dict
import os

# Resolve the table relative to this module's own directory, so the lookup
# does not depend on the process's current working directory.
_CNM3_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cnm3')
pinyin_dict = load_pinyin_dict(os.path.join(_CNM3_DIR, 'ds_CNM3.txt'))
def chinese_to_cnm3(text: str) -> List[str]:
    """Convert Chinese text into a flat list of tone-tagged phone symbols.

    Steps:
      1. Map punctuation and symbols through ``PUNC_TABLE``.
      2. Spell out numbers with ``normalize_number``.
      3. Strip a small set of special characters.
      4. Segment the text with jieba and convert each word with
         ``lazy_pinyin`` (TONE3 style, neutral tone written as 5).
      5. Look each syllable up in ``pinyin_dict`` and append the tone digit
         to every phone of that syllable.

    Args:
        text: Input text.

    Returns:
        The list of phone symbols. Items ending in a digit are looked up in
        ``pinyin_dict`` and dropped when the syllable is not in the table.
        Items ending in a letter are dropped. Any other item (for example
        punctuation) is appended one character at a time.
    """
    # Punctuation and number normalization.
    text = text.translate(PUNC_TABLE)
    text = normalize_number(text)
    # Filter out special characters.
    text = re.sub(r'[#&@“”^_|\\]', '', text)

    phones: List[str] = []
    for word in jieba.lcut(text, cut_all=False):
        for pinyin in lazy_pinyin(word, style=Style.TONE3, neutral_tone_with_five=True):
            if not pinyin:
                # Nothing to classify; avoids indexing an empty string.
                continue
            last = pinyin[-1]
            if last.isdigit():
                # str.isdigit() also accepts characters such as "²", and the
                # table may not list every syllable. Skip unknown syllables
                # rather than aborting the whole sentence with a KeyError.
                syllable_phones = pinyin_dict.get(pinyin[:-1])
                if syllable_phones is not None:
                    phones.extend(ph + last for ph in syllable_phones)
            elif last.isalpha():
                # Unconverted item ending in a letter: intentionally dropped.
                continue
            else:
                # Extending with a str adds its characters one by one.
                phones.extend(pinyin)
    return phones