GameServerZ

Running

File size: 15,759 Bytes

864affd

import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import Tensor
from torchaudio._internal import load_state_dict_from_url
from torchaudio.functional import mu_law_decoding
from torchaudio.models import Tacotron2, WaveRNN
from torchaudio.transforms import GriffinLim, InverseMelScale

from . import utils
from .interface import Tacotron2TTSBundle

__all__ = []

_BASE_URL = "https://download.pytorch.org/torchaudio/models"


################################################################################
# Pipeline implementation - Text Processor
################################################################################


class _EnglishCharProcessor(Tacotron2TTSBundle.TextProcessor):
    def __init__(self):
        super().__init__()
        self._tokens = utils._get_chars()
        self._mapping = {s: i for i, s in enumerate(self._tokens)}

    @property
    def tokens(self):
        return self._tokens

    def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
        if isinstance(texts, str):
            texts = [texts]
        indices = [[self._mapping[c] for c in t.lower() if c in self._mapping] for t in texts]
        return utils._to_tensor(indices)


class _EnglishPhoneProcessor(Tacotron2TTSBundle.TextProcessor):
    def __init__(self, *, dl_kwargs=None):
        super().__init__()
        self._tokens = utils._get_phones()
        self._mapping = {p: i for i, p in enumerate(self._tokens)}
        self._phonemizer = utils._load_phonemizer("en_us_cmudict_forward.pt", dl_kwargs=dl_kwargs)
        self._pattern = r"(\[[A-Z]+?\]|[_!'(),.:;? -])"

    @property
    def tokens(self):
        return self._tokens

    def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
        if isinstance(texts, str):
            texts = [texts]

        indices = []
        for phones in self._phonemizer(texts, lang="en_us"):
            # '[F][UW][B][AA][R]!' -> ['F', 'UW', 'B', 'AA', 'R', '!']
            ret = [re.sub(r"[\[\]]", "", r) for r in re.findall(self._pattern, phones)]
            indices.append([self._mapping[p] for p in ret])
        return utils._to_tensor(indices)


################################################################################
# Pipeline implementation - Vocoder
################################################################################


class _WaveRNNVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
    def __init__(self, model: WaveRNN, min_level_db: Optional[float] = -100):
        super().__init__()
        self._sample_rate = 22050
        self._model = model
        self._min_level_db = min_level_db

    @property
    def sample_rate(self):
        return self._sample_rate

    def forward(self, mel_spec, lengths=None):
        mel_spec = torch.exp(mel_spec)
        mel_spec = 20 * torch.log10(torch.clamp(mel_spec, min=1e-5))
        if self._min_level_db is not None:
            mel_spec = (self._min_level_db - mel_spec) / self._min_level_db
            mel_spec = torch.clamp(mel_spec, min=0, max=1)
        waveform, lengths = self._model.infer(mel_spec, lengths)
        waveform = utils._unnormalize_waveform(waveform, self._model.n_bits)
        waveform = mu_law_decoding(waveform, self._model.n_classes)
        waveform = waveform.squeeze(1)
        return waveform, lengths


class _GriffinLimVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder):
    def __init__(self):
        super().__init__()
        self._sample_rate = 22050
        self._inv_mel = InverseMelScale(
            n_stft=(1024 // 2 + 1),
            n_mels=80,
            sample_rate=self.sample_rate,
            f_min=0.0,
            f_max=8000.0,
            mel_scale="slaney",
            norm="slaney",
        )
        self._griffin_lim = GriffinLim(
            n_fft=1024,
            power=1,
            hop_length=256,
            win_length=1024,
        )

    @property
    def sample_rate(self):
        return self._sample_rate

    def forward(self, mel_spec, lengths=None):
        mel_spec = torch.exp(mel_spec)
        mel_spec = mel_spec.clone().detach().requires_grad_(True)
        spec = self._inv_mel(mel_spec)
        spec = spec.detach().requires_grad_(False)
        waveforms = self._griffin_lim(spec)
        return waveforms, lengths


################################################################################
# Bundle classes mixins
################################################################################


class _CharMixin:
    def get_text_processor(self) -> Tacotron2TTSBundle.TextProcessor:
        return _EnglishCharProcessor()


class _PhoneMixin:
    def get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor:
        return _EnglishPhoneProcessor(dl_kwargs=dl_kwargs)


@dataclass
class _Tacotron2Mixin:
    _tacotron2_path: str
    _tacotron2_params: Dict[str, Any]

    def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
        model = Tacotron2(**self._tacotron2_params)
        url = f"{_BASE_URL}/{self._tacotron2_path}"
        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
        state_dict = load_state_dict_from_url(url, **dl_kwargs)
        model.load_state_dict(state_dict)
        model.eval()
        return model


@dataclass
class _WaveRNNMixin:
    _wavernn_path: Optional[str]
    _wavernn_params: Optional[Dict[str, Any]]

    def get_vocoder(self, *, dl_kwargs=None):
        wavernn = self._get_wavernn(dl_kwargs=dl_kwargs)
        return _WaveRNNVocoder(wavernn)

    def _get_wavernn(self, *, dl_kwargs=None):
        model = WaveRNN(**self._wavernn_params)
        url = f"{_BASE_URL}/{self._wavernn_path}"
        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
        state_dict = load_state_dict_from_url(url, **dl_kwargs)
        model.load_state_dict(state_dict)
        model.eval()
        return model


class _GriffinLimMixin:
    def get_vocoder(self, **_):
        return _GriffinLimVocoder()


################################################################################
# Bundle classes
################################################################################


@dataclass
class _Tacotron2WaveRNNCharBundle(_WaveRNNMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
    pass


@dataclass
class _Tacotron2WaveRNNPhoneBundle(_WaveRNNMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
    pass


@dataclass
class _Tacotron2GriffinLimCharBundle(_GriffinLimMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle):
    pass


@dataclass
class _Tacotron2GriffinLimPhoneBundle(_GriffinLimMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle):
    pass


################################################################################
# Instantiate bundle objects
################################################################################


TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH = _Tacotron2GriffinLimCharBundle(
    _tacotron2_path="tacotron2_english_characters_1500_epochs_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=38),
)
TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and

:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.



The text processor encodes the input texts character-by-character.



You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.

The default parameters were used.



Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.



Example - "Hello world! T T S stands for Text to Speech!"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>



Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>

"""  # noqa: E501

TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH = _Tacotron2GriffinLimPhoneBundle(
    _tacotron2_path="tacotron2_english_phonemes_1500_epochs_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=96),
)
TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and

:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.



The text processor encodes the input texts based on phoneme.

It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert

graphemes to phonemes.

The model (*en_us_cmudict_forward*) was trained on

`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.



You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.

The text processor is set to the *"english_phonemes"*.



Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.



Example - "Hello world! T T S stands for Text to Speech!"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>



Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>



"""  # noqa: E501

TACOTRON2_WAVERNN_CHAR_LJSPEECH = _Tacotron2WaveRNNCharBundle(
    _tacotron2_path="tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=38),
    _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth",
    _wavernn_params=utils._get_wrnn_params(),
)
TACOTRON2_WAVERNN_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and :py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.



The text processor encodes the input texts character-by-character.



You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.

The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,

``mel_fmin=40``, and ``mel_fmax=11025``.



You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.



Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.



Example - "Hello world! T T S stands for Text to Speech!"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>



Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>

"""  # noqa: E501

TACOTRON2_WAVERNN_PHONE_LJSPEECH = _Tacotron2WaveRNNPhoneBundle(
    _tacotron2_path="tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth",
    _tacotron2_params=utils._get_taco_params(n_symbols=96),
    _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth",
    _wavernn_params=utils._get_wrnn_params(),
)
TACOTRON2_WAVERNN_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and

:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.



The text processor encodes the input texts based on phoneme.

It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert

graphemes to phonemes.

The model (*en_us_cmudict_forward*) was trained on

`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.



You can find the training script for Tacotron2 `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.

The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,

``mel_fmin=40``, and ``mel_fmax=11025``.



You can find the training script for WaveRNN `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.



Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.



Example - "Hello world! T T S stands for Text to Speech!"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>





Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"



   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png

      :alt: Spectrogram generated by Tacotron2



   .. raw:: html



      <audio controls="controls">

         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.wav" type="audio/wav">

         Your browser does not support the <code>audio</code> element.

      </audio>

"""  # noqa: E501