# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""Text cleaning for French input.

Expands common abbreviations and normalizes punctuation, symbols and
whitespace before the text is handed to a tokenizer that produces phonemes.
"""

import re

# Abbreviations that are expanded only when written with a trailing period.
# They are matched case-insensitively and the period is consumed.
_DOTTED_ABBREVIATIONS = [
    ("M", "monsieur"),
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
    ("N.B", "nota bene"),
    ("p.c.q", "parce que"),
    ("Pr", "professeur"),
    ("qqch", "quelque chose"),
    ("rdv", "rendez-vous"),
    ("max", "maximum"),
    ("min", "minimum"),
    ("no", "numéro"),
    ("adr", "adresse"),
    ("dr", "docteur"),
    ("st", "saint"),
    ("co", "compagnie"),
    ("jr", "junior"),
    ("sgt", "sergent"),
    ("capt", "capitaine"),
    ("col", "colonel"),
    ("av", "avenue"),
    ("av. J.-C", "avant Jésus-Christ"),
    ("apr. J.-C", "après Jésus-Christ"),
    ("art", "article"),
    ("boul", "boulevard"),
    ("c.-à-d", "c’est-à-dire"),
    ("etc", "et cetera"),
    ("ex", "exemple"),
    ("excl", "exclusivement"),
]

# Titles that are also expanded without a trailing period (case-sensitive).
_UNDOTTED_ABBREVIATIONS = [
    ("Mlle", "mademoiselle"),
    ("Mlles", "mesdemoiselles"),
    ("Mme", "Madame"),
    ("Mmes", "Mesdames"),
]


def _compile_abbreviations(pairs, suffix, flags=0):
    """Build ``(compiled_regex, expansion)`` pairs for abbreviations.

    Args:
        pairs: Iterable of ``(abbreviation, expansion)`` string pairs.
        suffix: Regex fragment that must follow the abbreviation.
        flags: Flags passed to ``re.compile``.

    Returns:
        A list of ``(pattern, expansion)`` tuples, longest abbreviation first.
    """
    # Longest first, so that e.g. "av. J.-C" is tried before its prefix "av".
    longest_first = sorted(pairs, key=lambda pair: len(pair[0]), reverse=True)
    return [
        # re.escape keeps the dots and hyphens inside abbreviations literal.
        (re.compile(r"\b" + re.escape(abbr) + suffix, flags), expansion)
        for abbr, expansion in longest_first
    ]


# List of (regular expression, replacement) pairs for abbreviations in French.
_abbreviations = _compile_abbreviations(
    _DOTTED_ABBREVIATIONS, r"\.", re.IGNORECASE
) + _compile_abbreviations(
    # The trailing \b stops "Mlle" from matching inside "Mlles".
    _UNDOTTED_ABBREVIATIONS,
    r"\b",
)

# Punctuation normalization table: every key found in the text is replaced by
# its value (see replace_punctuation).
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": ".",
    "…": ".",
    "$": ".",
    "“": "",
    "”": "",
    "‘": "",
    "’": "",
    "(": "",
    ")": "",
    "《": "",
    "》": "",
    "【": "",
    "】": "",
    "[": "",
    "]": "",
    "—": "",
    "~": "-",
    "「": "",
    "」": "",
    "¿": "",
    "¡": "",
    # Full-width variants, written as escapes because they are easily
    # confused with (or silently normalized to) their ASCII counterparts.
    "\uff1a": ",",  # full-width colon
    "\uff1b": ",",  # full-width semicolon
    "\uff0c": ",",  # full-width comma
    "\uff01": "!",  # full-width exclamation mark
    "\uff1f": "?",  # full-width question mark
    "\uff08": "",  # full-width left parenthesis
    "\uff09": "",  # full-width right parenthesis
    "\uff5e": "-",  # full-width tilde
}

# Compiled once at import time from the keys of rep_map.
_punctuation_re = re.compile("|".join(re.escape(p) for p in rep_map))

# Regular expression matching runs of whitespace.
_whitespace_re = re.compile(r"\s+")

# Single-character symbol substitutions used by replace_symbols.
_symbol_table = str.maketrans({";": ",", "-": " ", ":": ",", "&": " et "})


def collapse_whitespace(text):
    """Replace each whitespace run with one space and strip both ends."""
    return _whitespace_re.sub(" ", text).strip()


def remove_punctuation_at_begin(text):
    """Drop any leading run of ``,``, ``.``, ``!`` or ``?`` characters."""
    return re.sub(r"^[,.!?]+", "", text)


def remove_aux_symbols(text):
    """Remove angle brackets, parentheses, square brackets, ``"`` and guillemets."""
    return re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)


def replace_symbols(text):
    """Map ``;`` and ``:`` to ``,``, ``-`` to a space, and ``&`` to `` et ``."""
    return text.translate(_symbol_table)


def expand_abbreviations(text):
    """Expand every known abbreviation in ``text`` to its spoken form."""
    for regex, replacement in _abbreviations:
        text = regex.sub(replacement, text)
    return text


def replace_punctuation(text):
    """Replace each occurrence of a ``rep_map`` key with its mapped value."""
    return _punctuation_re.sub(lambda match: rep_map[match.group()], text)


def text_normalize(text):
    """Run the full cleaning pipeline on one string.

    The steps run in a fixed order: abbreviation expansion, punctuation
    replacement, symbol replacement, auxiliary symbol removal, leading
    punctuation removal, and whitespace collapsing.

    Args:
        text: The raw input string.

    Returns:
        The normalized string. No terminal punctuation is appended.
    """
    text = expand_abbreviations(text)
    text = replace_punctuation(text)
    text = replace_symbols(text)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    return collapse_whitespace(text)


def french_to_ipa(text, text_tokenizer):
    """Normalize ``text`` and pass it to ``text_tokenizer``.

    Args:
        text: A single string, or an iterable of strings.
        text_tokenizer: Callable invoked with the normalized string, or with
            a new list of normalized strings when ``text`` is not a string.

    Returns:
        Whatever ``text_tokenizer`` returns.
    """
    if isinstance(text, str):
        return text_tokenizer(text_normalize(text))
    # Build a new list so the caller's sequence is left untouched.
    return text_tokenizer([text_normalize(item) for item in text])