Spaces:
Runtime error
Runtime error
from __future__ import annotations | |
from gruut import sentences | |
import os | |
import re | |
class Tokenizer:
    """Convert Russian text into a sequence of integer symbol ids.

    The vocabulary is read from ``vocab.txt`` inside the directory given to
    the constructor. Each non-blank line of that file is an integer Unicode
    code point; the symbol's id is its zero-based position among the
    non-blank lines.
    """

    # Compiled once at class creation instead of on every call.
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, path) -> None:
        """Load the symbol table from ``<path>/vocab.txt``.

        Args:
            path: Directory that contains ``vocab.txt``.

        Raises:
            OSError: If the vocabulary file cannot be opened.
            ValueError: If a non-blank line is not a valid integer code point.
        """
        vocab_path = os.path.join(path, "vocab.txt")
        with open(vocab_path, "r", encoding="utf-8") as vocab_file:
            # Skip blank lines: a trailing newline at the end of the file
            # would otherwise produce an empty entry and make int() fail.
            self.symbols = [
                chr(int(line)) for line in vocab_file if line.strip()
            ]
        # If a symbol occurs more than once, the last occurrence wins.
        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}

    def _ru_phonems(self, text: str) -> str:
        """Return the phoneme string for *text*.

        The text is lower-cased and passed to ``sentences(..., lang="ru")``.
        The phonemes of every word that has any are concatenated with no
        separator between words; runs of whitespace are then collapsed to a
        single space and the result is stripped at both ends.
        """
        parts = []
        for sent in sentences(text.lower(), lang="ru"):
            for word in sent:
                # Words with no phonemes (None or empty) contribute nothing.
                if word.phonemes:
                    parts.append("".join(word.phonemes))
        return self._WHITESPACE_RE.sub(' ', "".join(parts)).strip()

    def _text_to_sequence(self, text: str) -> list[int]:
        """Convert *text* to a list of symbol ids.

        Raises:
            KeyError: If the phoneme string contains a character that is not
                in the vocabulary.
        """
        clean_text = self._ru_phonems(text)
        return [self.symbol_to_id[symbol] for symbol in clean_text]

    def _get_seq(self, text: str) -> list[int]:
        """Return the symbol-id sequence for *text*.

        Thin wrapper around ``_text_to_sequence``.
        """
        return self._text_to_sequence(text)