# ttts/gpt/voice_tokenizer.py
import re
import torch
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
# from data.audio.paired_voice_audio_dataset import load_mozilla_cv, load_voxpopuli, load_tsv
# from models.audio.tts.tacotron2 import load_filepaths_and_text
# from models.audio.tts.tacotron2.text.cleaners import english_cleaners
def remove_extraneous_punctuation(word):
    """Map bracket/quote/dash variants to plain ASCII equivalents and drop symbols with no spoken content."""
    replacement_punctuation = {
        '{': '(', '}': ')',
        '[': '(', ']': ')',
        '`': '\'', '—': '-',
        'ʼ': '\''
    }
    # One alternation over every key; the longest-first sort is future-proofing, since all current keys are single characters.
    replace = re.compile("|".join(re.escape(k) for k in sorted(replacement_punctuation, key=len, reverse=True)),
                         flags=re.DOTALL)
word = replace.sub(lambda x: replacement_punctuation[x.group(0)], word)
# TODO: some of these are spoken ('@', '%', '+', etc). Integrate them into the cleaners.
    # Anchored to the full string: only an input that is exactly one of these symbols is blanked out.
    extraneous = re.compile(r'^[@#%_=\$\^&\*\+\\]$')
    word = extraneous.sub('', word)
return word
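# e.g. remove_extraneous_punctuation('{quote}') -> '(quote)', and a lone '@' -> '' (dropped);
# characters outside the replacement map pass through unchanged.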
class VoiceBpeTokenizer:
    def __init__(self, vocab_file):
        # Default to None so a missing vocab surfaces immediately rather than as a missing attribute later.
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
def preprocess_text(self, txt):
# txt = english_cleaners(txt)
txt = remove_extraneous_punctuation(txt)
return txt
def encode(self, txt):
txt = self.preprocess_text(txt)
        # Encode spaces as explicit [SPACE] tokens so word boundaries survive the Whitespace pre-tokenizer.
        txt = txt.replace(' ', '[SPACE]')
return self.tokenizer.encode(txt).ids
def decode(self, seq):
if isinstance(seq, torch.Tensor):
seq = seq.cpu().numpy()
        # Strip the spaces the detokenizer inserts between tokens; real spaces are restored from [SPACE] below.
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(' ', '')
txt = txt.replace('[SPACE]', ' ')
txt = txt.replace('[STOP]', '')
txt = txt.replace('[UNK]', '')
return txt
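# Example round-trip (a hypothetical snippet; assumes gpt/gpt_tts_tokenizer.json has already
# been produced by train() below):
#   tok = VoiceBpeTokenizer('gpt/gpt_tts_tokenizer.json')
#   ids = tok.encode('hello there')   # spaces travel as [SPACE] tokens
#   tok.decode(ids)                   # -> 'hello there'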
def train():
with open('data/bpe_train-set.txt', 'r', encoding='utf-8') as at:
ttsd = at.readlines()
#bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']
#allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
    # Digits, lowercase Latin letters, and a whitelist of Western plus full-width CJK punctuation.
    allowed_characters_re = re.compile(r'^[0-9a-z!:;"/, \-\(\)\.\'\?ʼ,。?:;’‘”“、!…()]+$')
def preprocess_word(word, report=False):
# word = english_cleaners(word)
word = remove_extraneous_punctuation(word)
        if not allowed_characters_re.match(word):
if report and word:
print(f"REPORTING: '{word}'")
return ''
return word
def batch_iterator(batch_size=1000):
print("Processing ASR texts.")
for i in range(0, len(ttsd), batch_size):
yield [preprocess_word(t, True) for t in ttsd[i:i+batch_size]]
#print("Processing bookcorpus.")
#for i in range(0, len(bcd), batch_size):
# yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]
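    # Tiny vocab by design: 255 ids (presumably so every token id fits in one byte);
    # the [STOP]/[UNK]/[SPACE] specials are assigned the first ids.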
trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]', '[SPACE]'], vocab_size=255)
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))#+len(bcd))
    # Eyeball a round trip on text containing out-of-vocab junk before saving.
    print(tokenizer.decode(tokenizer.encode("i was traveling throughhadslfghds the woods in 1235375t137{{}}").ids))
tokenizer.save('gpt/gpt_tts_tokenizer.json')
def test():
tok = VoiceBpeTokenizer('gpt/gpt_tts_tokenizer.json')
with open('data/bpe_train-set.txt', 'r', encoding='utf-8') as at:
ttsd = at.readlines()
for line in ttsd:
line = line.strip()
seq = tok.encode(line)
out = tok.decode(seq)
print(f">>>{line}")
print(f"<<<{out}")
if __name__ == '__main__':
    '''
    Prerequisite (builds the data/bpe_train-set.txt file used above):
    python script/all_text_to_one_file.py
    '''
# train()
test()