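# NOTE(review): the six lines below appear to be residue from the web file
# viewer this source was copied from; they are not part of the module.
# ing0's picture
# chinese_lexicon
# 566e204
# raw
# history blame
# 18.8 kB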
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re
import jieba
import cn2an
from pypinyin import lazy_pinyin, BOPOMOFO
from typing import List
from diffrhythm.g2p.g2p.chinese_model_g2p import BertPolyPredict
from diffrhythm.g2p.utils.front_utils import *
import os
from huggingface_hub import hf_hub_download
# from g2pw import G2PWConverter
# Blank-insertion level for the bopomofo output: {0: "none", 1: "char", 2: "word"}
BLANK_LEVEL = 0
# conv = G2PWConverter(style='pinyin', enable_non_tradional_chinese=True)

# Root of the bundled G2P resources, relative to the current working directory.
resource_path = r"./diffrhythm/g2p"


def _require_path(path, description):
    """Stop start-up with a non-zero exit status when *path* does not exist.

    The message is printed to stdout. ``SystemExit(1)`` is raised directly
    instead of calling the ``site`` helper ``exit()``, which is meant for
    interactive use, is missing under ``python -S`` and, called without an
    argument, reports success (status 0) to the calling process.
    """
    if not os.path.exists(path):
        print(
            "Incorrect path for {}: {}, please check...".format(description, path)
        )
        raise SystemExit(1)


poly_all_class_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polychar.txt"
)
_require_path(poly_all_class_path, "polyphonic character class dictionary")
# generate_poly_lexicon is not defined in this file; presumably it comes from
# the wildcard import of diffrhythm.g2p.utils.front_utils.
poly_dict = generate_poly_lexicon(poly_all_class_path)

# Set up G2PW model parameters
g2pw_poly_model_path = os.path.join(resource_path, "sources", "g2p_chinese_model")
_require_path(g2pw_poly_model_path, "g2pw polyphonic character model")

json_file_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polydict.json"
)
_require_path(json_file_path, "g2pw id to pinyin dictionary")

jsonr_file_path = os.path.join(
    resource_path, "sources", "g2p_chinese_model", "polydict_r.json"
)
_require_path(jsonr_file_path, "g2pw pinyin to id dictionary")

# Note the argument order: model directory, pinyin->id file, id->pinyin file.
g2pw_poly_predict = BertPolyPredict(
    g2pw_poly_model_path, jsonr_file_path, json_file_path
)
# Text clean time

# List of (Latin alphabet, bopomofo) pairs: each entry is a compiled,
# case-insensitive pattern for one letter and the bopomofo string (with tone
# marks) substituted for it. The bopomofo literals are stored as real Unicode
# bopomofo/tone characters; the previous mis-decoded byte sequences could not
# be converted further by the bopomofo -> IPA table.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), bopomofo)
    for letter, bopomofo in [
        ("a", "ㄟˉ"),
        ("b", "ㄅㄧˋ"),
        ("c", "ㄙㄧˉ"),
        ("d", "ㄉㄧˋ"),
        ("e", "ㄧˋ"),
        ("f", "ㄝˊㄈㄨˋ"),
        ("g", "ㄐㄧˋ"),
        ("h", "ㄝˇㄑㄩˋ"),
        ("i", "ㄞˋ"),
        ("j", "ㄐㄟˋ"),
        ("k", "ㄎㄟˋ"),
        ("l", "ㄝˊㄛˋ"),
        ("m", "ㄝˊㄇㄨˋ"),
        ("n", "ㄣˉ"),
        ("o", "ㄡˉ"),
        ("p", "ㄆㄧˉ"),
        ("q", "ㄎㄧㄡˉ"),
        ("r", "ㄚˋ"),
        ("s", "ㄝˊㄙˋ"),
        ("t", "ㄊㄧˋ"),
        ("u", "ㄧㄡˉ"),
        ("v", "ㄨㄧˉ"),
        ("w", "ㄉㄚˋㄅㄨˋㄌㄧㄡˋ"),
        ("x", "ㄝˉㄎㄨˋㄙˋ"),
        ("y", "ㄨㄞˋ"),
        ("z", "ㄗㄟˋ"),
    ]
]
# List of (bopomofo, ipa) pairs, as (compiled pattern, replacement).
# The substitutions are meant to be applied one after another in list order,
# so the multi-symbol patterns come before the single-symbol ones. The
# literals are stored as real Unicode characters; the previous mis-decoded
# keys could never match bopomofo text.
_bopomofo_to_ipa = [
    (re.compile(bopomofo), ipa)
    for bopomofo, ipa in [
        ("ㄅㄛ", "p⁼wo"),
        ("ㄆㄛ", "pʰwo"),
        ("ㄇㄛ", "mwo"),
        ("ㄈㄛ", "fwo"),
        ("ㄧㄢ", "|jɛn"),
        ("ㄩㄢ", "|ɥæn"),
        ("ㄧㄣ", "|in"),
        ("ㄩㄣ", "|ɥn"),
        ("ㄧㄥ", "|iŋ"),
        ("ㄨㄥ", "|ʊŋ"),
        ("ㄩㄥ", "|jʊŋ"),
        # Add: further compound finals
        ("ㄧㄚ", "|ia"),
        ("ㄧㄝ", "|iɛ"),
        ("ㄧㄠ", "|iɑʊ"),
        ("ㄧㄡ", "|ioʊ"),
        ("ㄧㄤ", "|iɑŋ"),
        ("ㄨㄚ", "|ua"),
        ("ㄨㄛ", "|uo"),
        ("ㄨㄞ", "|uaɪ"),
        ("ㄨㄟ", "|ueɪ"),
        ("ㄨㄢ", "|uan"),
        ("ㄨㄣ", "|uən"),
        ("ㄨㄤ", "|uɑŋ"),
        ("ㄩㄝ", "|ɥɛ"),
        # End
        ("ㄅ", "p⁼"),
        ("ㄆ", "pʰ"),
        ("ㄇ", "m"),
        ("ㄈ", "f"),
        ("ㄉ", "t⁼"),
        ("ㄊ", "tʰ"),
        ("ㄋ", "n"),
        ("ㄌ", "l"),
        ("ㄍ", "k⁼"),
        ("ㄎ", "kʰ"),
        ("ㄏ", "x"),
        ("ㄐ", "tʃ⁼"),
        ("ㄑ", "tʃʰ"),
        ("ㄒ", "ʃ"),
        ("ㄓ", "ts`⁼"),
        ("ㄔ", "ts`ʰ"),
        ("ㄕ", "s`"),
        ("ㄖ", "ɹ`"),
        ("ㄗ", "ts⁼"),
        ("ㄘ", "tsʰ"),
        ("ㄙ", "|s"),
        ("ㄚ", "|a"),
        ("ㄛ", "|o"),
        ("ㄜ", "|ə"),
        ("ㄝ", "|ɛ"),
        ("ㄞ", "|aɪ"),
        ("ㄟ", "|eɪ"),
        ("ㄠ", "|ɑʊ"),
        ("ㄡ", "|oʊ"),
        ("ㄢ", "|an"),
        ("ㄣ", "|ən"),
        ("ㄤ", "|ɑŋ"),
        ("ㄥ", "|əŋ"),
        ("ㄦ", "əɹ"),
        ("ㄧ", "|i"),
        ("ㄨ", "|u"),
        ("ㄩ", "|ɥ"),
        # Tone marks become arrow notation followed by a "|" separator.
        ("ˉ", "→|"),
        ("ˊ", "↑|"),
        ("ˇ", "↓↑|"),
        ("ˋ", "↓|"),
        ("˙", "|"),
    ]
]
# Words ending in "儿" that are excluded from the er tone adjustment.
# Stored as real CJK characters; the previous mis-decoded literals could never
# equal a segmented word.
must_not_er_words = {"女儿", "老儿", "男儿", "少儿", "小儿"}
# Word -> pinyin lexicon, downloaded from the Hugging Face Hub at import time.
chinese_lexicon_path = hf_hub_download(
    repo_id="ASLP-lab/DiffRhythm",
    filename="diffrhythm/g2p/sources/chinese_lexicon.txt",
    repo_type="space",
)
word_pinyin_dict = {}
# Each non-blank line is "<word>\t<pinyin>". The ``with`` block closes the
# file, so no explicit close() is needed afterwards.
with open(chinese_lexicon_path, "r", encoding="utf-8") as fread:
    for txt in fread:
        txt = txt.strip()
        if not txt:
            # A blank (e.g. trailing) line carries no entry; skip it instead
            # of failing the two-field unpacking below.
            continue
        word, pinyin = txt.split("\t")
        word_pinyin_dict[word] = pinyin
# Toneless pinyin syllable -> bopomofo spelling, read from a tab-separated file
# located relative to the current working directory.
pinyin_2_bopomofo_dict = {}
# The ``with`` block closes the file, so no explicit close() is needed.
with open(
    r"./diffrhythm/g2p/sources/pinyin_2_bpmf.txt", "r", encoding="utf-8"
) as fread:
    for txt in fread:
        txt = txt.strip()
        if not txt:
            # Skip blank lines instead of failing the two-field unpacking.
            continue
        pinyin, bopomofo = txt.split("\t")
        pinyin_2_bopomofo_dict[pinyin] = bopomofo
# Pinyin tone digit -> bopomofo tone mark. Tone 1 is left unmarked here, and
# both "0" and "5" map to the neutral-tone dot. Stored as real Unicode
# modifier letters; the previous mis-decoded two-character values could never
# equal a single trailing tone mark.
tone_dict = {
    "0": "˙",
    "5": "˙",
    "1": "",
    "2": "ˊ",
    "3": "ˇ",
    "4": "ˋ",
}
# Reverse lookup built from a tab-separated file: the SECOND column of each
# line is the key and the FIRST column is the value.
bopomofos2pinyin_dict = {}
# The ``with`` block closes the file, so no explicit close() is needed.
with open(
    r"./diffrhythm/g2p/sources/bpmf_2_pinyin.txt", "r", encoding="utf-8"
) as fread:
    for txt in fread:
        txt = txt.strip()
        if not txt:
            # Skip blank lines instead of failing the two-field unpacking.
            continue
        v, k = txt.split("\t")
        bopomofos2pinyin_dict[k] = v
def bpmf_to_pinyin(text):
    """Convert a ``|``-separated bopomofo string to space-separated pinyin.

    Every character of each ``|`` segment is looked up in
    ``bopomofos2pinyin_dict`` (characters without an entry are ignored) and
    the pieces are concatenated. Segments that produce nothing are skipped.
    If the result does not end in one of the digits ``0``-``4``, tone ``1`` is
    appended. A fixed, order-dependent chain of spelling rewrites is then
    applied (e.g. ``uei`` -> ``wei``/``ui``, ``uen`` -> ``wen``/``un``,
    leading ``u``/``i`` -> ``w``/``y``).

    Args:
        text: bopomofo syllables separated by ``|``.

    Returns:
        The rewritten syllables joined by single spaces (``""`` if none).
    """
    bopomofo_list = text.split("|")
    pinyin_list = []
    for info in bopomofo_list:
        pinyin = "".join(
            bopomofos2pinyin_dict[c] for c in info if c in bopomofos2pinyin_dict
        )
        if not pinyin:
            continue
        if pinyin[-1] not in "01234":
            pinyin += "1"
        # From here on the last character is always the tone digit, so a bare
        # "sh" without a digit cannot occur (the former `pinyin == "sh"`
        # check was unreachable and has been dropped).
        if pinyin[:-1] == "ve":
            pinyin = "y" + pinyin
        if pinyin[:-1] == "sh":
            pinyin = pinyin[:-1] + "i" + pinyin[-1]
        if pinyin[:-1] == "s":
            pinyin = "si" + pinyin[-1]
        if pinyin[:-1] == "c":
            pinyin = "ci" + pinyin[-1]
        if pinyin[:-1] == "i":
            pinyin = "yi" + pinyin[-1]
        if pinyin[:-1] == "iou":
            pinyin = "you" + pinyin[-1]
        if pinyin[:-1] == "ien":
            pinyin = "yin" + pinyin[-1]
        if "iou" in pinyin and pinyin[-4:-1] == "iou":
            pinyin = pinyin[:-4] + "iu" + pinyin[-1]
        if "uei" in pinyin:
            if pinyin[:-1] == "uei":
                pinyin = "wei" + pinyin[-1]
            elif pinyin[-4:-1] == "uei":
                pinyin = pinyin[:-4] + "ui" + pinyin[-1]
        if "uen" in pinyin and pinyin[-4:-1] == "uen":
            if pinyin[:-1] == "uen":
                pinyin = "wen" + pinyin[-1]
            else:
                # Initial + "uen" contracts to "un". This branch used to test
                # for "uei", which can never hold inside the enclosing "uen"
                # condition, so the contraction was never applied.
                pinyin = pinyin[:-4] + "un" + pinyin[-1]
        if "van" in pinyin and pinyin[-4:-1] == "van":
            if pinyin[:-1] == "van":
                pinyin = "yuan" + pinyin[-1]
            elif pinyin[-4:-1] == "van":
                pinyin = pinyin[:-4] + "uan" + pinyin[-1]
        if "ueng" in pinyin and pinyin[-5:-1] == "ueng":
            pinyin = pinyin[:-5] + "ong" + pinyin[-1]
        if pinyin[:-1] == "veng":
            pinyin = "yong" + pinyin[-1]
        if "veng" in pinyin and pinyin[-5:-1] == "veng":
            pinyin = pinyin[:-5] + "iong" + pinyin[-1]
        if pinyin[:-1] == "ieng":
            pinyin = "ying" + pinyin[-1]
        if pinyin[:-1] == "u":
            pinyin = "wu" + pinyin[-1]
        if pinyin[:-1] == "v":
            pinyin = "yv" + pinyin[-1]
        if pinyin[:-1] == "ing":
            pinyin = "ying" + pinyin[-1]
        if pinyin[:-1] == "z":
            pinyin = "zi" + pinyin[-1]
        if pinyin[:-1] == "zh":
            pinyin = "zhi" + pinyin[-1]
        if pinyin[0] == "u":
            pinyin = "w" + pinyin[1:]
        if pinyin[0] == "i":
            pinyin = "y" + pinyin[1:]
        pinyin = pinyin.replace("ien", "in")
        pinyin_list.append(pinyin)
    return " ".join(pinyin_list)
# Convert numbers to Chinese pronunciation
def number_to_chinese(text):
    """Return *text* after passing it through ``cn2an.transform`` in "an2cn" mode."""
    return cn2an.transform(text, "an2cn")
def normalization(text):
text = text.replace(",", ",")
text = text.replace("。", ".")
text = text.replace("!", "!")
text = text.replace("?", "?")
text = text.replace("οΌ›", ";")
text = text.replace(":", ":")
text = text.replace("、", ",")
text = text.replace("β€˜", "'")
text = text.replace("’", "'")
text = text.replace("β‹―", "…")
text = text.replace("Β·Β·Β·", "…")
text = text.replace("・・・", "…")
text = text.replace("...", "…")
text = re.sub(r"\s+", "", text)
text = re.sub(r"[^\u4e00-\u9fff\s_,\.\?!;:\'…]", "", text)
text = re.sub(r"\s*([,\.\?!;:\'…])\s*", r"\1", text)
return text
def change_tone(bopomofo: str, tone: str) -> str:
    """Return *bopomofo* with its tone mark set to *tone*.

    If the last character is one of the tone marks ``˙ ˊ ˇ ˋ`` it is replaced
    by *tone*; otherwise *tone* is appended. Passing ``""`` therefore removes
    an existing mark. The mark set is written with real Unicode modifier
    letters; the previous mis-decoded literal never contained an actual tone
    mark, so marks were appended instead of replaced.

    Raises:
        IndexError: if *bopomofo* is empty.
    """
    if bopomofo[-1] in "˙ˊˇˋ":
        return bopomofo[:-1] + tone
    return bopomofo + tone
def er_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    """Give the final syllable the neutral tone for words ending in "儿".

    Applies only to words longer than one character whose last character is
    "儿" and that are not listed in ``must_not_er_words``. *bopomofos* is
    modified in place and also returned. An empty *bopomofos* is returned
    unchanged.

    The characters are real Unicode ("儿", "˙"); the previous mis-decoded
    literals could never match a word's last character.
    """
    if (
        bopomofos
        and len(word) > 1
        and word[-1] == "儿"
        and word not in must_not_er_words
    ):
        bopomofos[-1] = change_tone(bopomofos[-1], "˙")
    return bopomofos
def bu_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    """Apply the tone changes of "不" inside *word*.

    Rules, first match wins:
      * a word made only of "不", or the word "不字": unchanged;
      * a three-character word with "不" in the middle whose middle syllable
        is "ㄅㄨ" plus one trailing character: that character becomes "˙";
      * otherwise every "不" directly followed by a syllable ending in "ˋ"
        has its own last character replaced by "ˊ".

    *bopomofos* is modified in place and also returned. The tone marks are
    real Unicode modifier letters; the previous mis-decoded two-character
    literals could never equal a syllable's single last character.
    """
    distinct_chars = set(word)
    if len(distinct_chars) == 1 and "不" in distinct_chars:
        return bopomofos
    if word in ["不字"]:
        return bopomofos
    if len(word) == 3 and word[1] == "不" and bopomofos[1][:-1] == "ㄅㄨ":
        bopomofos[1] = bopomofos[1][:-1] + "˙"
        return bopomofos
    for i, char in enumerate(word):
        if (
            i + 1 < len(bopomofos)
            and char == "不"
            and i + 1 < len(word)
            and 0 < len(bopomofos[i + 1])
            and bopomofos[i + 1][-1] == "ˋ"
        ):
            bopomofos[i] = bopomofos[i][:-1] + "ˊ"
    return bopomofos
def yi_sandhi(word: str, bopomofos: List[str]) -> List[str]:
    """Apply the tone changes of "一" inside *word*.

    Rules, first match wins:
      1. The word contains "一" and at least one other numeric character:
         a leading "一" whose next character is not one of 零一二三四五六七八九十
         gets "ˊ" when the second syllable ends in "ˋ" or "˙", else "ˋ";
         every other "一" has its tone mark removed.
      2. Three characters of the form X一X: the middle syllable gets "˙".
      3. The word starts with "第一": the second syllable's mark is removed.
      4. The word starts with "一月", "一日" or "一号": the first syllable's
         mark is removed.
      5. Otherwise each non-final "一" gets "ˊ" before a syllable ending in
         "ˋ", and "ˋ" in all other cases unless the next character is
         punctuation.

    *bopomofos* is modified in place and also returned. All literals are real
    Unicode characters; with the previous mis-decoded literals none of the
    "一" comparisons could succeed.
    """
    punc = ":,;。?!“”‘’':,;.?!()(){}【】[]-~`、 "
    plain_digits = ("零", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十")

    if "一" in word and any(ch.isnumeric() for ch in word if ch != "一"):
        for i, ch in enumerate(word):
            if (
                i == 0
                and ch == "一"
                and len(word) > 1
                and word[1] not in plain_digits
            ):
                # The guard inspects the syllable that is actually indexed.
                # It used to test bopomofos[0] while reading bopomofos[1][-1],
                # which raised IndexError for a missing or empty second
                # syllable.
                if (
                    len(bopomofos) > 1
                    and len(bopomofos[1]) > 0
                    and bopomofos[1][-1] in ("ˋ", "˙")
                ):
                    bopomofos[0] = change_tone(bopomofos[0], "ˊ")
                else:
                    bopomofos[0] = change_tone(bopomofos[0], "ˋ")
            elif ch == "一":
                bopomofos[i] = change_tone(bopomofos[i], "")
        return bopomofos

    if len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
        bopomofos[1] = change_tone(bopomofos[1], "˙")
    elif word.startswith("第一"):
        bopomofos[1] = change_tone(bopomofos[1], "")
    elif word.startswith(("一月", "一日", "一号")):
        bopomofos[0] = change_tone(bopomofos[0], "")
    else:
        for i, ch in enumerate(word):
            if ch != "一" or i + 1 >= len(word):
                continue
            if (
                len(bopomofos) > i + 1
                and len(bopomofos[i + 1]) > 0
                and bopomofos[i + 1][-1] == "ˋ"
            ):
                bopomofos[i] = change_tone(bopomofos[i], "ˊ")
            elif word[i + 1] not in punc:
                bopomofos[i] = change_tone(bopomofos[i], "ˋ")
    return bopomofos
def merge_bu(seg: List) -> List:
    """Glue every standalone "不" token onto the token that follows it.

    A "不" that has nothing to attach to (it is the last token, or it is
    followed by another "不") is kept as its own token. Such tokens used to
    be dropped silently, which lost text and made the concatenation of the
    returned tokens shorter than the input, breaking any character-offset
    bookkeeping done on the result.

    Args:
        seg: list of word tokens.

    Returns:
        A new list; the concatenation of its items equals that of *seg*.
    """
    merged = []
    pending = ""  # a "不" waiting for the next token
    for word in seg:
        if word == "不":
            if pending:
                # Two "不" in a row: emit the earlier one on its own.
                merged.append(pending)
            pending = word
        else:
            merged.append(pending + word)
            pending = ""
    if pending:
        # Trailing "不" with no following token.
        merged.append(pending)
    return merged
def merge_er(seg: List) -> List:
    """Append every standalone "儿" token to the token before it.

    A "儿" in first position has nothing to attach to and is kept as is.
    The character is the real Unicode "儿"; the previous mis-decoded literal
    could never equal a token, so nothing was ever merged.

    Args:
        seg: list of word tokens.

    Returns:
        A new list with the merges applied.
    """
    merged = []
    for word in seg:
        if merged and word == "儿":
            merged[-1] += word
        else:
            merged.append(word)
    return merged
def merge_yi(seg: List) -> List:
    """Merge "一" with its neighbours so tone rules see it inside one token.

    Two passes:
      1. A token sequence X, "一", X (same token on both sides) is folded
         into the single token "X一X".
      2. Any remaining standalone "一" is glued onto the token after it.

    The character is the real Unicode "一"; the previous mis-decoded literal
    could never equal a token. A former intermediate pass that only copied
    the list (both branches appended the word) has been removed.

    Args:
        seg: list of word tokens.

    Returns:
        A new list with the merges applied.
    """
    folded = []
    # Pass 1: reduplication "X 一 X".
    for i, word in enumerate(seg):
        if (
            i - 1 >= 0
            and word == "一"
            and i + 1 < len(seg)
            and seg[i - 1] == seg[i + 1]
        ):
            if i - 1 < len(folded):
                folded[i - 1] = folded[i - 1] + "一" + folded[i - 1]
            else:
                # NOTE(review): `folded` is indexed with a position taken from
                # `seg`. Once an earlier merge has made `folded` shorter, this
                # branch runs and later "X一X" groups are not folded (pass 2
                # then yields "一X"). Kept as is to preserve segmentation;
                # confirm whether `folded[-1]` was intended.
                folded.append(word)
                folded.append(seg[i + 1])
        elif i - 2 >= 0 and seg[i - 1] == "一" and seg[i - 2] == word:
            # Second X of an "X 一 X" group: already emitted above.
            continue
        else:
            folded.append(word)

    # Pass 2: attach a standalone "一" to the following token.
    merged = []
    for word in folded:
        if merged and merged[-1] == "一":
            merged[-1] = merged[-1] + word
        else:
            merged.append(word)
    return merged
# Word Segmentation, and convert Chinese pronunciation to pinyin (bopomofo)
def chinese_to_bopomofo(text_short, sentence):
    """Segment *text_short* and convert it to a ``|``-separated bopomofo string.

    The text is cut with jieba, the tokens are regrouped by ``merge_yi``,
    ``merge_bu`` and ``merge_er``, and each token is converted:

      * tokens found in ``word_pinyin_dict`` (and not in ``poly_dict``) use
        the lexicon pinyin, falling back to ``lazy_pinyin`` for a syllable
        that cannot be mapped;
      * other tokens are converted character by character, using the
        polyphone predictor for characters in ``poly_dict``, then the lexicon,
        and finally the raw character itself.

    Third-tone, "不", "一" and "儿" tone adjustments are applied per token.
    Tokens without any CJK ideograph are appended verbatim after a ``|``.
    Syllables that end without a tone mark get "ˉ" appended.

    The tone-mark literals are real Unicode modifier letters; the previous
    mis-decoded two-character literals could never equal a syllable's last
    character, so the third-tone rules and the "ˉ" suffix were wrong.

    Args:
        text_short: normalised text to convert; also handed to the polyphone
            predictor together with a character offset.
        sentence: not used by this function.

    Returns:
        The bopomofo string.
    """
    # bopomofos = conv(text_short)
    words = jieba.lcut(text_short, cut_all=False)
    words = merge_yi(words)
    words = merge_bu(words)
    words = merge_er(words)
    text = ""
    # Offset of the current token inside text_short, for the predictor.
    char_index = 0
    for word in words:
        bopomofos = []
        if word in word_pinyin_dict and word not in poly_dict:
            pinyin = word_pinyin_dict[word]
            for py in pinyin.split(" "):
                if py[:-1] in pinyin_2_bopomofo_dict and py[-1] in tone_dict:
                    bopomofos.append(
                        pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
                    )
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
                else:
                    # Unmappable syllable: add pypinyin's reading of the
                    # whole word.
                    bopomofos_lazy = lazy_pinyin(word, BOPOMOFO)
                    bopomofos += bopomofos_lazy
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
        else:
            for i, c in enumerate(word):
                if c in poly_dict:
                    poly_pinyin = g2pw_poly_predict.predict_process(
                        [text_short, char_index + i]
                    )[0]
                    # Drop the first two and the last character of the
                    # prediction (presumably wrapper characters - TODO confirm
                    # against BertPolyPredict's output format).
                    py = poly_pinyin[2:-1]
                    bopomofos.append(
                        pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
                    )
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
                elif c in word_pinyin_dict:
                    py = word_pinyin_dict[c]
                    bopomofos.append(
                        pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
                    )
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
                else:
                    bopomofos.append(c)
                    if BLANK_LEVEL == 1:
                        bopomofos.append("_")
        if BLANK_LEVEL == 2:
            bopomofos.append("_")
        char_index += len(word)

        if (
            len(word) == 3
            and bopomofos[0][-1] == "ˇ"
            and bopomofos[1][-1] == "ˇ"
            and bopomofos[-1][-1] == "ˇ"
        ):
            # NOTE(review): this APPENDS "ˊ" after the existing "ˇ", whereas
            # the two-syllable rule below REPLACES the mark. It looks
            # unintended but is kept so the emitted symbol sequence does not
            # change - confirm before altering.
            bopomofos[0] = bopomofos[0] + "ˊ"
            bopomofos[1] = bopomofos[1] + "ˊ"
        if len(word) == 2 and bopomofos[0][-1] == "ˇ" and bopomofos[-1][-1] == "ˇ":
            bopomofos[0] = bopomofos[0][:-1] + "ˊ"
        bopomofos = bu_sandhi(word, bopomofos)
        bopomofos = yi_sandhi(word, bopomofos)
        bopomofos = er_sandhi(word, bopomofos)

        if not re.search("[\u4e00-\u9fff]", word):
            # No CJK ideograph (e.g. punctuation): pass the token through.
            text += "|" + word
            continue

        # A syllable ending in a bopomofo letter has no tone mark yet: mark
        # it with "ˉ".
        bopomofos = [re.sub(r"([\u3105-\u3129])$", r"\1ˉ", b) for b in bopomofos]
        if text != "":
            text += "|"
        text += "|".join(bopomofos)
    return text
# Convert latin pronunciation to pinyin (bopomofo)
def latin_to_bopomofo(text):
    """Apply every (pattern, replacement) pair of ``_latin_to_bopomofo`` to *text*.

    The pairs are applied one after another, each on the result of the
    previous substitution, and the final string is returned.
    """
    for pattern, bopomofo in _latin_to_bopomofo:
        text = pattern.sub(bopomofo, text)
    return text
# Convert pinyin (bopomofo) to IPA
def bopomofo_to_ipa(text):
    """Apply every (pattern, replacement) pair of ``_bopomofo_to_ipa`` to *text*.

    The pairs are applied in list order, each on the output of the previous
    one, so earlier entries take precedence over later ones.
    """
    for pattern, ipa in _bopomofo_to_ipa:
        text = pattern.sub(ipa, text)
    return text
def _chinese_to_ipa(text, sentence):
    """Convert one Chinese string to its ``|``-separated IPA representation.

    Pipeline: numbers -> Chinese numerals, punctuation normalisation,
    bopomofo conversion, Latin letters -> bopomofo, bopomofo -> IPA, then a
    few regex clean-ups (see inline comments).

    The IPA characters in the patterns are real Unicode; the previous
    mis-decoded literals made the character classes match the wrong symbols,
    so tone arrows and "⁼"/"ʰ" were stripped by the filtering step.

    Args:
        text: the input string.
        sentence: forwarded unchanged to ``chinese_to_bopomofo``.

    Returns:
        The IPA string, without a leading or trailing ``|``.
    """
    text = number_to_chinese(text.strip())
    text = normalization(text)
    text = chinese_to_bopomofo(text, sentence)
    # pinyin = bpmf_to_pinyin(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_ipa(text)
    # Insert "ɹ" after an "s`"/"ɹ`" (optionally followed by "⁼" or "ʰ") that
    # is directly followed by tone arrows/spaces or the end of the string.
    text = re.sub(r"([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Same for a plain "s" (optionally followed by "⁼" or "ʰ").
    text = re.sub(r"([s][⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
    # Drop a leading "|" and every character outside the allowed symbol set.
    text = re.sub(r"^\||[^\w\s_,\.\?!;:\'…\|→↓↑⁼ʰ`]", "", text)
    # Surround punctuation with separators, then collapse separator runs.
    text = re.sub(r"([,\.\?!;:\'…])", r"|\1|", text)
    text = re.sub(r"\|+", "|", text)
    text = text.rstrip("|")
    return text
# Convert Chinese to IPA
def chinese_to_ipa(text, sentence, text_tokenizer):
    """Convert a string, or each string of an iterable, to IPA.

    Args:
        text: a single string, or an iterable of strings.
        sentence: forwarded unchanged to ``_chinese_to_ipa`` for every item.
        text_tokenizer: accepted for interface compatibility; not used.

    Returns:
        One IPA string when *text* is a string, otherwise a list with one IPA
        string per item.
    """
    # phonemes = text_tokenizer(text.strip())
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are converted as one string instead of being iterated
    # character by character.
    if isinstance(text, str):
        return _chinese_to_ipa(text, sentence)
    return [_chinese_to_ipa(t, sentence) for t in text]