TTTS / ttts /prepare /bpe_all_text_to_one_file.py
mrfakename's picture
Add source code
4ee33aa
raw
history blame contribute delete
871 Bytes
import re
from pypinyin import pinyin, lazy_pinyin, Style
# Build the BPE training corpus: every kept transcript from the two label
# files below is converted to tone-numbered pinyin and written, one utterance
# per line, to data/bpe_train-set.txt.

# Number of leading lines of the aishell3 label file that are skipped.
# NOTE(review): presumably a header/preamble -- confirm against the dataset.
_AISHELL3_SKIPPED_LINES = 5

# Matches '#' and any digit; both are removed from data-baker transcripts.
# Compiled once here instead of being looked up on every loop iteration.
_DATABAKER_MARK_RE = re.compile(r'[#\d]')


def _to_pinyin_line(text):
    """Return *text* as space-separated pinyin syllables in TONE3 style.

    ``neutral_tone_with_five=True`` is passed through to ``lazy_pinyin`` so
    neutral-tone syllables are requested with an explicit ``5``.
    """
    syllables = lazy_pinyin(text, style=Style.TONE3, neutral_tone_with_five=True)
    return ' '.join(syllables)


# The encoding is spelled out on every open(): without it Python falls back to
# the locale's preferred encoding, which is not UTF-8 on every platform, and
# the transcripts are Chinese text.
# NOTE(review): assumes both label files are stored as UTF-8 -- confirm.
with open('data/bpe_train-set.txt', 'w', encoding='utf-8') as out:
    # aishell3
    with open('data/label_train-set.txt', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < _AISHELL3_SKIPPED_LINES:
                continue
            # The transcript is the third '|'-separated field.
            text = line.strip().split('|')[2]
            # Remove the '%' and '$' marks; '% ' is handled first so the
            # space that follows a '%' is removed together with it.
            text = text.replace('% ', '').replace('$', '').replace('%', '')
            out.write(_to_pinyin_line(text) + '\n')
    # data-baker 1w
    with open('data/000001-010000.txt', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Only even-indexed lines are used; odd-indexed lines are skipped.
            if i % 2 == 1:
                continue
            # The transcript is the second tab-separated field.
            text = line.strip().split('\t')[1]
            text = _DATABAKER_MARK_RE.sub('', text)
            out.write(_to_pinyin_line(text) + '\n')