Spaces:
Running
on
Zero
Running
on
Zero
| # reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py | |
| import sys | |
| import re | |
| import cn2an | |
| from pyjyutping import jyutping | |
| from text.symbols import punctuation | |
| from text.zh_normalization.text_normlization import TextNormalizer | |
def normalizer(x):
    """Return *x* with Arabic numerals rewritten as Chinese numerals.

    Thin wrapper around ``cn2an.transform(x, "an2cn")``.

    Args:
        x: Input string.

    Returns:
        Whatever ``cn2an.transform`` returns for the ``"an2cn"`` mode.
    """
    return cn2an.transform(x, "an2cn")
# Prefixes tried, in order, when a toneless Jyutping syllable is split into
# an (initial, final) pair: the first entry that the syllable starts with is
# used, so the ORDER of this list is significant (e.g. "ng" must precede "n").
INITIALS = [
    # Entries starting with "a".
    "aa", "aai", "aak", "aap", "aat", "aau",
    "ai", "au", "ap", "at", "ak", "a",
    # Consonants (plus "e"); longer spellings come before their prefixes.
    "p", "b", "e",
    "ts", "t",
    "dz", "d",
    "kw", "k",
    "gw", "g",
    "f", "h", "l", "m",
    "ng", "n",
    "s", "y", "w", "c", "z", "j",
    # Entries starting with "o" / "u".
    "ong", "on", "ou", "oi", "ok", "o",
    "uk", "ung",
    # Presumably pause / silence markers -- TODO confirm against the symbol set.
    "sp", "spl", "spn", "sil",
]
# Punctuation normalisation table consumed by replace_punctuation(): every
# key found in the text is replaced by its value.
#
# The previous literal listed "(", ")" and "~" twice and mapped ",", "!" and
# "?" to themselves, which suggests the full-width variants had been folded
# into their ASCII look-alikes. The full-width forms are therefore spelled as
# \uXXXX escapes so they cannot be folded silently again. The ASCII keys that
# had a real effect (":", ";", "(", ")", "~") are kept; the identity entries
# were no-ops in replace_punctuation() and are omitted.
rep_map = {
    # Clause and sentence separators.
    ":": ",",
    "\uff1a": ",",  # full-width colon
    ";": ",",
    "\uff1b": ",",  # full-width semicolon
    "\uff0c": ",",  # full-width comma
    "。": ".",
    "\uff01": "!",  # full-width exclamation mark
    "\uff1f": "?",  # full-width question mark
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    # Quotation marks and brackets all collapse to an apostrophe.
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # full-width left parenthesis
    "\uff09": "'",  # full-width right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "「": "'",
    "」": "'",
    # Dashes and tildes collapse to a hyphen.
    "—": "-",
    "~": "-",
    "\uff5e": "-",  # full-width tilde
}
def replace_punctuation(text):
    """Canonicalise punctuation in *text* and drop unsupported characters.

    First every key of ``rep_map`` that occurs in *text* is replaced by its
    mapped value. Then every run of characters that is neither in the range
    U+4E00-U+9FA5 nor one of the symbols in ``punctuation`` is removed.

    Args:
        text: Input string.

    Returns:
        The filtered string.
    """
    replaced_text = text
    if rep_map:  # an empty alternation would match "" and fail the lookup
        # Longest keys first, so a multi-character key such as "..." can
        # never be shadowed by a shorter key that is a prefix of it.
        keys = sorted(rep_map, key=len, reverse=True)
        mapping_pattern = re.compile("|".join(re.escape(key) for key in keys))
        replaced_text = mapping_pattern.sub(
            lambda match: rep_map[match.group()], text
        )

    # Escape each symbol individually: an unescaped "-", "]", "^" or "\" in
    # ``punctuation`` would otherwise alter the character class itself.
    allowed_symbols = "".join(re.escape(symbol) for symbol in punctuation)
    # NOTE(review): U+4E00-U+9FA5 excludes CJK Extension A (U+3400-U+4DBF),
    # so characters from that block are removed here -- confirm this is
    # intended for Cantonese input.
    return re.sub(
        r"[^\u4e00-\u9fa5" + allowed_symbols + r"]+", "", replaced_text
    )
def text_normalize(text):
    """Normalize *text* and clean the punctuation of each resulting piece.

    A new ``TextNormalizer`` is created per call; every item it returns from
    ``normalize(text)`` is passed through ``replace_punctuation`` and the
    results are concatenated without a separator.

    Args:
        text: Raw input string.

    Returns:
        The concatenated, cleaned string.
    """
    pieces = TextNormalizer().normalize(text)
    return "".join(replace_punctuation(piece) for piece in pieces)
# Set copy of ``punctuation``; used for the membership test that decides
# whether a phone symbol receives the "Y" prefix.
punctuation_set = set(punctuation)
def jyuping_to_initials_finals_tones(jyuping_syllables):
    """Turn Jyutping tokens into "Y"-prefixed phone symbols.

    Each token is handled as follows:

    * a symbol from ``punctuation`` or the token ``"_"`` is kept as a single
      unit with tone 0 and contributes ``1`` to ``word2ph``;
    * any other token has a trailing digit (if present) taken as its tone,
      and the remainder is split at the first entry of ``INITIALS`` it starts
      with into an (initial, final) pair, contributing ``2`` to ``word2ph``.

    Finals carry the tone digit as a suffix unless the tone is 0; initials
    never carry a tone. Every resulting symbol that is not in
    ``punctuation_set`` is prefixed with ``"Y"`` so that Cantonese symbols do
    not collide with Mandarin ones (for example ``"nei5"`` becomes ``"Yn"``
    and ``"Yei5"``).

    Args:
        jyuping_syllables: Iterable of token strings.

    Returns:
        A ``(phones, word2ph)`` tuple: the list of phone symbols and the list
        of per-token phone counts.
    """
    units = []
    unit_tones = []
    word2ph = []

    for token in jyuping_syllables:
        # Punctuation and the underscore pass through as one toneless unit.
        if token in punctuation or token == "_":
            units.append(token)
            unit_tones.append(0)
            word2ph.append(1)
            continue

        # A trailing digit is the tone; without one the tone defaults to 0.
        try:
            tone = int(token[-1])
        except ValueError:
            tone = 0
            base = token
        else:
            base = token[:-1]

        # First entry of INITIALS that prefixes the toneless syllable.
        matched = next(
            (candidate for candidate in INITIALS if base.startswith(candidate)),
            None,
        )
        if matched is None:
            # NOTE(review): a token that matches no entry adds nothing to
            # either output list -- confirm callers tolerate the gap.
            continue

        if base.startswith("nga"):
            # "nga..." is always cut after its first two characters,
            # regardless of which entry matched.
            pair = [base[:2], base[2:] or base[-1]]
        else:
            # When nothing is left after the initial, reuse its last letter.
            pair = [matched, base[len(matched) :] or matched[-1]]

        units.extend(pair)
        unit_tones.extend([-1, tone])  # -1 marks the toneless initial
        word2ph.append(2)

    assert len(units) == len(unit_tones)

    # Reworked output format: consonant + vowel-with-tone.
    phones = []
    for unit, tone in zip(units, unit_tones):
        symbol = unit if tone in (-1, 0) else "%s%s" % (unit, tone)
        # Prefix with "Y" to avoid clashing with Mandarin symbols;
        # punctuation is left unprefixed.
        if symbol not in punctuation_set:
            symbol = "Y%s" % symbol
        phones.append(symbol)

    return phones, word2ph
def get_jyutping(text):
    """Romanize *text* with ``jyutping.convert`` and tokenize the result.

    Each symbol in ``punctuation`` is padded with a space on both sides so
    that the final whitespace split yields it as a token of its own.

    Args:
        text: Input string.

    Returns:
        List of whitespace-separated tokens from the converted string.
    """
    romanized = jyutping.convert(text)
    for mark in punctuation:
        romanized = romanized.replace(mark, " %s " % mark)
    return romanized.split()
def get_bert_feature(text, word2ph):
    """Delegate to ``text.chinese_bert.get_bert_feature(text, word2ph)``.

    The backend module is imported at call time rather than at module import
    (presumably to defer loading it until needed -- TODO confirm).
    """
    from text import chinese_bert as bert_backend

    features = bert_backend.get_bert_feature(text, word2ph)
    return features
def g2p(text):
    """Convert *text* to phone symbols.

    The text is tokenized with ``get_jyutping`` and the tokens are passed to
    ``jyuping_to_initials_finals_tones``.

    Args:
        text: Input string (``__main__`` passes the output of
            ``text_normalize``).

    Returns:
        A ``(phones, word2ph)`` tuple as produced by
        ``jyuping_to_initials_finals_tones``.
    """
    tokens = get_jyutping(text)
    phone_symbols, phones_per_token = jyuping_to_initials_finals_tones(tokens)
    return phone_symbols, phones_per_token
if __name__ == "__main__":
    # Manual smoke test: normalize a sample Cantonese sentence, convert it
    # to phones and print the phones together with the per-token counts.
    sample = text_normalize("佢個鋤頭太短啦。")
    phones, word2ph = g2p(sample)
    print(phones, word2ph)