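"""Text tokenizer utilities for TTS.

Trains a small byte-pair-encoding (BPE) vocabulary with the HuggingFace
`tokenizers` library and wraps it in a VoiceBpeTokenizer that converts raw
text to token ids (and back), representing spaces with an explicit [SPACE]
token.
"""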
import re

import torch
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# from data.audio.paired_voice_audio_dataset import load_mozilla_cv, load_voxpopuli, load_tsv
# from models.audio.tts.tacotron2 import load_filepaths_and_text
# from models.audio.tts.tacotron2.text.cleaners import english_cleaners


def remove_extraneous_punctuation(word):
    # Normalize unusual punctuation to ASCII equivalents.
    replacement_punctuation = {
        '{': '(', '}': ')',
        '[': '(', ']': ')',
        '`': '\'', 'ʼ': '\'',
        '—': '-',
    }
    replace = re.compile("|".join([re.escape(k) for k in sorted(replacement_punctuation, key=len, reverse=True)]),
                         flags=re.DOTALL)
    word = replace.sub(lambda x: replacement_punctuation[x.group(0)], word)

    # TODO: some of these are spoken ('@', '%', '+', etc). Integrate them into the cleaners.
    extraneous = re.compile(r'^[@#%_=\$\^&\*\+\\]$')
    word = extraneous.sub('', word)
    return word
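# For illustration: remove_extraneous_punctuation('[hello] `world`') returns
# "(hello) 'world'", while a word that is just a bare symbol such as '@' or
# '%' is dropped entirely.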


class VoiceBpeTokenizer:
    def __init__(self, vocab_file):
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

    def preprocess_text(self, txt):
        # txt = english_cleaners(txt)
        txt = remove_extraneous_punctuation(txt)
        return txt

    def encode(self, txt):
        txt = self.preprocess_text(txt)
        # Spaces are encoded as the explicit [SPACE] special token.
        txt = txt.replace(' ', '[SPACE]')
        return self.tokenizer.encode(txt).ids

    def decode(self, seq):
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        # The decoder joins tokens with spaces; drop those, then restore real
        # spaces from [SPACE] and strip the remaining special tokens.
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(' ', '')
        txt = txt.replace('[SPACE]', ' ')
        txt = txt.replace('[STOP]', '')
        txt = txt.replace('[UNK]', '')
        return txt
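# Minimal usage sketch (assumes a trained vocab file such as the one written
# by train() below):
#   tok = VoiceBpeTokenizer('gpt/gpt_tts_tokenizer.json')
#   ids = tok.encode('hello world')   # spaces become [SPACE] tokens
#   print(tok.decode(ids))            # -> 'hello world'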


def train():
    # One line of text per training example.
    with open('data/bpe_train-set.txt', 'r', encoding='utf-8') as at:
        ttsd = at.readlines()
    #bcd = datasets.load_dataset('bookcorpus', cache_dir='Z:\\huggingface_datasets\\cache')['train']

    #allowed_characters_re = re.compile(r'^[0-9a-z!@#%_=:;"/, \-\$\^&\*\(\)\+\{\[\]\}\\\.\'\?—–ʼ]+$')
    # Only lines made up entirely of these characters are kept.
    allowed_characters_re = re.compile(r'^[0-9a-z!:;"/, \-\(\)\.\'\?ʼ,。?:;’‘”“、!…()]+$')

    def preprocess_word(word, report=False):
        # word = english_cleaners(word)
        word = remove_extraneous_punctuation(word)
        if not bool(allowed_characters_re.match(word)):
            if report and word:
                print(f"REPORTING: '{word}'")
            return ''
        return word

    def batch_iterator(batch_size=1000):
        print("Processing ASR texts.")
        for i in range(0, len(ttsd), batch_size):
            yield [preprocess_word(t, True) for t in ttsd[i:i+batch_size]]

        #print("Processing bookcorpus.")
        #for i in range(0, len(bcd), batch_size):
        #    yield [preprocess_word(t) for t in bcd[i:i+batch_size]['text']]

    # Very small vocabulary; [STOP], [UNK] and [SPACE] are reserved special tokens.
    trainer = BpeTrainer(special_tokens=['[STOP]', '[UNK]', '[SPACE]'], vocab_size=255)
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train_from_iterator(batch_iterator(), trainer, length=len(ttsd))  # +len(bcd)
    print(tokenizer.decode(tokenizer.encode("i was traveling throughhadslfghds the woods in 1235375t137{{}}").ids))
    tokenizer.save('gpt/gpt_tts_tokenizer.json')


def test():
    tok = VoiceBpeTokenizer('gpt/gpt_tts_tokenizer.json')
    with open('data/bpe_train-set.txt', 'r', encoding='utf-8') as at:
        ttsd = at.readlines()
    for line in ttsd:
        line = line.strip()
        seq = tok.encode(line)
        out = tok.decode(seq)
        print(f">>>{line}")
        print(f"<<<{out}")


if __name__ == '__main__':
    # The training corpus is produced separately, e.g.:
    #   python script/all_text_to_one_file.py
    # train()
    test()