Upload 2 files
app.py
CHANGED
@@ -96,19 +96,24 @@ def point_num(num):
     a, b = num.group().split('.')
     return ' point '.join([a, ' '.join(b)])
 
-def normalize(text):
-
+def normalize_text(text, lang):
+    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
+    text = text.replace('«', chr(8220)).replace('»', chr(8221))
+    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
+    text = parens_to_angles(text)
+    for a, b in zip('、。！，：；？', ',.!,:;?'):
+        text = text.replace(a, b+' ')
+    text = re.sub(r'[^\S \n]', ' ', text)
+    text = re.sub(r' +', ' ', text)
+    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
+    if lang == 'j':
+        return text.strip()
     text = re.sub(r'\bD[Rr]\.(?= [A-Z])', 'Doctor', text)
     text = re.sub(r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister', text)
     text = re.sub(r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss', text)
     text = re.sub(r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs', text)
     text = re.sub(r'\betc\.(?! [A-Z])', 'etc', text)
     text = re.sub(r'(?i)\b(y)eah?\b', r"\1e'a", text)
-    text = text.replace(chr(8216), "'").replace(chr(8217), "'")
-    text = text.replace(chr(8220), '"').replace(chr(8221), '"')
-    text = re.sub(r'[^\S \n]', ' ', text)
-    text = re.sub(r' +', ' ', text)
-    text = re.sub(r'(?<=\n) +(?=\n)', '', text)
     text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', split_num, text)
     text = re.sub(r'(?<=\d),(?=\d)', '', text)
     text = re.sub(r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b', flip_money, text)
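Note: the new normalize_text moves the curly-quote, guillemet, and CJK punctuation handling ahead of the English-only rules and returns early for Japanese (lang == 'j'). A minimal, self-contained sketch of just those shared steps; parens_to_angles and the English abbreviation/number rules from app.py are omitted here, so this is illustrative only:

import re

def demo_shared_normalization(text):
    # Quote and punctuation steps mirrored from normalize_text above;
    # not the full app.py function.
    text = text.replace(chr(8216), "'").replace(chr(8217), "'")  # curly single quotes -> '
    text = text.replace('«', chr(8220)).replace('»', chr(8221))  # guillemets -> curly double quotes
    text = text.replace(chr(8220), '"').replace(chr(8221), '"')  # curly double quotes -> "
    for a, b in zip('、。！，：；？', ',.!,:;?'):                  # CJK/fullwidth punctuation -> ASCII + space
        text = text.replace(a, b + ' ')
    text = re.sub(r'[^\S \n]', ' ', text)                         # non-space whitespace (tabs, etc.) -> space
    text = re.sub(r' +', ' ', text)                               # collapse repeated spaces
    return text.strip()

print(demo_shared_normalization('«こんにちは»、世界。'))  # "こんにちは", 世界.

With lang == 'j', normalize_text stops after these steps, so the English abbreviation and number rules below never touch Japanese input.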
@@ -119,7 +124,7 @@ def normalize(text):
     text = re.sub(r"(?<=X')S\b", 's', text)
     text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
     text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
-    return
+    return text.strip()
 
 phonemizers = dict(
     a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
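For reference, the two acronym rules kept as context here hyphenate dotted initialisms so they are read letter by letter, and the function now explicitly strips its result. A quick sketch of just those substitutions (hypothetical helper name, same regexes as above):

import re

def hyphenate_acronyms(text):
    text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
    text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
    return text.strip()

print(hyphenate_acronyms('The U.S.A. team visited F.B.I. Headquarters.'))
# The U-S-A- team visited F-B-I. Headquarters.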
@@ -178,7 +183,7 @@ def resolve_voices(voice, warn=True):
 def phonemize(text, voice, norm=True):
     lang = resolve_voices(voice)[0][0]
     if norm:
-        text = normalize(text)
+        text = normalize_text(text, lang)
     ps = phonemizers[lang].phonemize([text])
     ps = ps[0] if ps else ''
     # TODO: Custom phonemization rules?
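phonemize() now routes through normalize_text with the language picked off the resolved voice (apparently the first character of the first resolved voice entry, assuming ids along the lines of 'af_...'). The backend lookup itself is unchanged; a stripped-down sketch of that flow, assuming espeak-ng is installed:

import phonemizer

backends = dict(
    a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
)

def to_phonemes(text, lang='a'):
    # Same call pattern as phonemize() above: the backend takes a list of
    # utterances and returns a list of phoneme strings.
    ps = backends[lang].phonemize([text])
    return ps[0] if ps else ''

print(to_phonemes('Hello there, doctor.'))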
@@ -438,9 +443,10 @@ def recursive_split(text, voice):
     return recursive_split(a, voice) + recursive_split(b, voice)
 
 def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
+    lang = resolve_voices(voice)[0][0]
     if skip_square_brackets:
         text = re.sub(r'\[.*?\]', '', text)
-    texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}',
+    texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize_text(text, lang))] if newline_split > 0 else [normalize_text(text, lang)]
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
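segment_and_tokenize now normalizes before splitting, and newline_split=0 disables paragraph splitting entirely. The split itself is a plain regex over runs of newlines; a small sketch without the normalization step:

import re

def split_paragraphs(text, newline_split=2):
    # Mirrors the texts = [...] line above, minus normalize_text:
    # split on runs of `newline_split` or more newlines, or keep the
    # whole text as one chunk when newline_split is 0.
    if newline_split > 0:
        return [t.strip() for t in re.split('\n{' + str(newline_split) + ',}', text)]
    return [text]

print(split_paragraphs('Paragraph one.\n\nParagraph two.'))
# ['Paragraph one.', 'Paragraph two.']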
katsu.py
CHANGED
@@ -231,6 +231,7 @@ HEPBURN.update({
     '『': '"',
     '』': '"',
     '：': ':',
+    '；': ';',
     '（': '(',
     '）': ')',
     '《': '(',
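The katsu.py change only adds the fullwidth semicolon to the HEPBURN punctuation table. How the table is applied is not part of this diff; a mapping like this is typically used as a per-character lookup, roughly as in this illustrative sketch:

# Hypothetical application of a punctuation table like HEPBURN; katsu.py's
# actual lookup logic is not shown in this diff.
PUNCT = {'：': ':', '；': ';', '（': '(', '）': ')', '『': '"', '』': '"'}

def map_punct(text, table=PUNCT):
    return ''.join(table.get(ch, ch) for ch in text)

print(map_punct('（テスト）：はい；いいえ'))  # (テスト):はい;いいえ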