Spaces:

mrfakename
/

TTTS

Build error

App Files Files Community

TTTS / ttts /gpt /voice_tokenizer.py

mrfakename

Add source code

4ee33aa over 1 year ago

raw

history blame contribute delete

3.87 kB

	import re

	import torch
	from tokenizers import Tokenizer
	from tokenizers.models import BPE
	from tokenizers.pre_tokenizers import Whitespace
	from tokenizers.trainers import BpeTrainer

	# from data.audio.paired_voice_audio_dataset import load_mozilla_cv, load_voxpopuli, load_tsv
	# from models.audio.tts.tacotron2 import load_filepaths_and_text
	# from models.audio.tts.tacotron2.text.cleaners import english_cleaners


	# Single-character substitutions that fold bracket/quote/dash variants onto
	# plain parentheses, apostrophes and hyphens.
	_PUNCTUATION_TRANSLATION = str.maketrans({
	    '{': '(', '}': ')',
	    '[': '(', ']': ')',
	    '`': '\'', '—': '-',
	    'ʼ': '\'',
	})

	# Matches input that consists of exactly one of these symbols.
	# TODO: some of these are spoken ('@', '%', '+', etc). Integrate them into the cleaners.
	_LONE_EXTRANEOUS_SYMBOL = re.compile(r'^[@#%_=\$\^&\*\+\\]$')


	def remove_extraneous_punctuation(word):
	    """Normalize unusual punctuation in ``word`` and drop lone symbols.

	    Two steps are applied in order:

	    1. Every ``{``/``[`` becomes ``(``, every ``}``/``]`` becomes ``)``,
	       backticks and ``ʼ`` become ``'``, and em dashes become ``-``.
	    2. If the result is a single character from ``@#%_=$^&*+\\``, that
	       character is removed. Symbols embedded in longer text are kept.

	    Args:
	        word: The text to clean (a single word or a whole line).

	    Returns:
	        The cleaned string; ``''`` when the input was only a lone symbol.
	    """
	    # The previous implementation joined the alternatives with "\|", which
	    # the regex engine reads as a literal pipe, so nothing was ever replaced.
	    # A translation table performs the same single-character mapping directly.
	    word = word.translate(_PUNCTUATION_TRANSLATION)
	    return _LONE_EXTRANEOUS_SYMBOL.sub('', word)


	class VoiceBpeTokenizer:
	    """Wrapper around a ``tokenizers.Tokenizer`` that marks spaces as ``[SPACE]``.

	    Text is cleaned with ``remove_extraneous_punctuation`` and every space is
	    replaced by the literal ``[SPACE]`` token before encoding; decoding
	    reverses that substitution and strips ``[STOP]`` and ``[UNK]``.
	    """

	    def __init__(self, vocab_file):
	        """Load the tokenizer from ``vocab_file`` if one is given.

	        Args:
	            vocab_file: Path to a tokenizer JSON file, or ``None`` to create
	                the wrapper without a tokenizer (``self.tokenizer`` is then
	                ``None`` and must be assigned before encoding/decoding).
	        """
	        # Always define the attribute so it can be inspected or assigned
	        # later even when no vocabulary file was supplied.
	        self.tokenizer = None
	        if vocab_file is not None:
	            self.tokenizer = Tokenizer.from_file(vocab_file)

	    def preprocess_text(self, txt):
	        """Return ``txt`` with unusual punctuation normalized."""
	        # txt = english_cleaners(txt)
	        return remove_extraneous_punctuation(txt)

	    def encode(self, txt):
	        """Convert ``txt`` into a list of token ids.

	        Spaces are replaced by the ``[SPACE]`` token text before encoding.
	        """
	        txt = self.preprocess_text(txt)
	        txt = txt.replace(' ', '[SPACE]')
	        return self.tokenizer.encode(txt).ids

	    def decode(self, seq):
	        """Convert a sequence of token ids back into text.

	        Args:
	            seq: Token ids as a list, a ``torch.Tensor``, or any array-like
	                object exposing ``tolist()``.

	        Returns:
	            The decoded text with ``[SPACE]`` turned back into spaces and
	            ``[STOP]``/``[UNK]`` removed.
	        """
	        if isinstance(seq, torch.Tensor):
	            seq = seq.cpu()
	        # Hand the tokenizer a plain list of Python ints rather than an
	        # array object; its decode() is documented to take a list of ids.
	        if hasattr(seq, 'tolist'):
	            seq = seq.tolist()
	        # Drop the spaces present in the decoded string; genuine spaces are
	        # carried by the '[SPACE]' token and restored on the next line.
	        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(' ', '')
	        txt = txt.replace('[SPACE]', ' ')
	        txt = txt.replace('[STOP]', '')
	        txt = txt.replace('[UNK]', '')
	        return txt


	def train(train_file='data/bpe_train-set.txt', output_file='gpt/gpt_tts_tokenizer.json', vocab_size=255):
	    """Train a BPE tokenizer on a text file and save it as JSON.

	    Each line of ``train_file`` is cleaned with
	    ``remove_extraneous_punctuation``. Lines containing any character
	    outside the allowed set (lowercase ASCII letters, digits, and the listed
	    ASCII/CJK punctuation) are replaced by an empty string and reported on
	    stdout. The tokenizer is trained with the special tokens ``[STOP]``,
	    ``[UNK]`` and ``[SPACE]`` and a whitespace pre-tokenizer.

	    Args:
	        train_file: UTF-8 text file with one training utterance per line.
	        output_file: Destination path for the trained tokenizer JSON.
	        vocab_size: Target vocabulary size passed to the BPE trainer.
	    """
	    with open(train_file, 'r', encoding='utf-8') as at:
	        ttsd = at.readlines()
	    #bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']

	    #allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\+\{\[\]\}\\\.\'\?—–ʼ]+$')
	    allowed_characters_re = re.compile(r'^[0-9a-z!:;"/, \-\.\'\?ʼ，。？：；’‘”“、！…（）]+$')

	    def preprocess_word(word, report=False):
	        """Clean ``word``; return '' if it has disallowed characters."""
	        # word = english_cleaners(word)
	        word = remove_extraneous_punctuation(word)
	        if allowed_characters_re.match(word):
	            return word
	        # Only non-empty rejects are worth reporting.
	        if report and word:
	            print(f"REPORTING: '{word}'")
	        return ''

	    def batch_iterator(batch_size=1000):
	        """Yield the cleaned training lines in lists of ``batch_size``."""
	        print("Processing ASR texts.")
	        for i in range(0, len(ttsd), batch_size):
	            yield [preprocess_word(t, True) for t in ttsd[i:i+batch_size]]

	        #print("Processing bookcorpus.")
	        #for i in range(0, len(bcd), batch_size):
	        #    yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]

	    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]', '[SPACE]'], vocab_size=vocab_size)
	    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
	    tokenizer.pre_tokenizer = Whitespace()
	    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))#+len(bcd))

	    # Quick sanity check: round-trip a sentence that includes junk input.
	    print(tokenizer.decode(tokenizer.encode("i was traveling throughhadslfghds the woods in 1235375t137{{}}").ids))

	    tokenizer.save(output_file)


	def test():
	    """Round-trip each line of the training text through the saved tokenizer.

	    For every line, the stripped input is printed prefixed with ``>>>`` and
	    the result of encoding then decoding it is printed prefixed with ``<<<``.
	    """
	    tok = VoiceBpeTokenizer('gpt/gpt_tts_tokenizer.json')
	    with open('data/bpe_train-set.txt', 'r', encoding='utf-8') as handle:
	        stripped_lines = [raw.strip() for raw in handle.readlines()]
	    for text in stripped_lines:
	        round_tripped = tok.decode(tok.encode(text))
	        print(f">>>{text}")
	        print(f"<<<{round_tripped}")


	# Script entry point: runs only when this file is executed directly.
	if __name__ == '__main__':
	# NOTE(review): the bare string below is a no-op expression statement; it
	# appears to record a data-preparation command to run first — TODO confirm.
	'''
	python script/all_text_to_one_file.py
	'''
	# Training is disabled here, so only the encode/decode round-trip in test()
	# runs; re-enable train() to rebuild the tokenizer file.
	# train()
	test()