# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Text cleaning for German G2P: normalize punctuation and collapse whitespace
before handing text to a phonemizer.
"""

import re

# Map full-width/CJK punctuation, curly quotes, and miscellaneous symbols to
# ASCII equivalents, or drop them entirely when they carry no prosodic meaning.
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    "“": "",
    "”": "",
    "‘": "",
    "’": "",
    "(": "",
    ")": "",
    "(": "",
    ")": "",
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "—": "",
    "~": "-",
    "~": "-",
    "「": "",
    "」": "",
    "¿": "",
    "¡": "",
}


def collapse_whitespace(text):
    # Regular expression matching whitespace:
    _whitespace_re = re.compile(r"\s+")
    return re.sub(_whitespace_re, " ", text).strip()


def remove_punctuation_at_begin(text):
    # Strip any sentence punctuation left dangling at the start of the text.
    return re.sub(r"^[,.!?]+", "", text)


def remove_aux_symbols(text):
    # Drop auxiliary symbols: angle brackets, parentheses, square brackets,
    # double quotes, and guillemets.
    text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
    return text


def replace_symbols(text):
    # Rewrite remaining ASCII symbols: ";" and ":" become pauses (","),
    # and "-" becomes a plain space.
    text = text.replace(";", ",")
    text = text.replace("-", " ")
    text = text.replace(":", ",")
    return text


def replace_punctuation(text):
    # Apply rep_map in a single pass: build one alternation over all escaped
    # keys and substitute each match with its mapped value.
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
    return replaced_text


def text_normalize(text):
    text = replace_punctuation(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    text = collapse_whitespace(text)
    # Append a final period if the text does not already end in punctuation.
    text = re.sub(r"([^\.,!\?\-…])$", r"\1.", text)
    return text


def german_to_ipa(text, text_tokenizer):
    # Accept either a single string or a list of strings; normalize the text
    # first, then pass it to the provided tokenizer/phonemizer callable.
    if isinstance(text, str):
        text = text_normalize(text)
        phonemes = text_tokenizer(text)
        return phonemes
    else:
        for i, t in enumerate(text):
            text[i] = text_normalize(t)
        return text_tokenizer(text)
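

# --- Usage sketch (illustrative; not part of the original module) ---
# `text_tokenizer` is assumed to be any callable that maps normalized text
# (a str or a list of str) to phoneme sequences, e.g. a phonemizer backend
# configured for German. The example below only exercises text_normalize,
# so it runs without any external dependency.
if __name__ == "__main__":
    sample = "Hallo, Welt! (Test)…"
    print(text_normalize(sample))  # -> Hallo, Welt! Test.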