Spaces:
Paused
Paused
File size: 10,479 Bytes
864affd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 |
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple, Union
from torch import Tensor
from torchaudio.models import Tacotron2
class _TextProcessor(ABC):
    """Abstract interface of a text processor.

    Concrete implementations encode text into numerical tensors and expose
    the list of tokens that the encoded values refer to.
    """

    @property
    @abstractmethod
    def tokens(self) -> List[str]:
        """The tokens that each value in the processed tensor represents.

        :type: List[str]
        """

    @abstractmethod
    def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]:
        """Encode the given (batch of) texts into numerical tensors.

        Args:
            texts (str or list of str): The input texts.

        Returns:
            (Tensor, Tensor):
            Tensor:
                The encoded texts. Shape: `(batch, max length)`
            Tensor:
                The valid length of each sample in the batch. Shape: `(batch, )`.
        """
class _Vocoder(ABC):
    """Abstract interface of a vocoder: a callable that maps spectrograms to waveforms."""

    @property
    @abstractmethod
    def sample_rate(self):
        """Sample rate of the waveform produced by this vocoder.

        :type: float
        """

    @abstractmethod
    def __call__(self, specgrams: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
        """Synthesize a waveform from the given input, such as a spectrogram.

        Args:
            specgrams (Tensor):
                Input spectrogram, shaped `(batch, frequency bins, time)`.
                The exact shape expected depends on the implementation.
            lengths (Tensor, or None, optional):
                Valid length of every sample in the batch, shaped `(batch, )`.
                (Default: `None`)

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor:
                The synthesized waveform. Shape: `(batch, max length)`
            Tensor or None:
                Valid length of every sample in the batch. Shape: `(batch, )`.
        """
class Tacotron2TTSBundle(ABC):
    """Data class that bundles associated information to use pretrained Tacotron2 and vocoder.

    This class provides interfaces for instantiating the pretrained model along with
    the information necessary to retrieve pretrained weights and additional data
    to be used with the model.

    Torchaudio library instantiates objects of this class, each of which represents
    a different pretrained model. Client code should access pretrained models via these
    instances.

    Please see below for the usage and the available values.

    Example - Character-based TTS pipeline with Tacotron2 and WaveRNN
        >>> import torchaudio
        >>>
        >>> text = "Hello, T T S !"
        >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
        >>>
        >>> # Build processor, Tacotron2 and WaveRNN model
        >>> processor = bundle.get_text_processor()
        >>> tacotron2 = bundle.get_tacotron2()
        Downloading:
        100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s]
        >>> vocoder = bundle.get_vocoder()
        Downloading:
        100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s]
        >>>
        >>> # Encode text
        >>> input, lengths = processor(text)
        >>>
        >>> # Generate (mel-scale) spectrogram
        >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
        >>>
        >>> # Convert spectrogram to waveform
        >>> waveforms, lengths = vocoder(specgram, lengths)
        >>>
        >>> torchaudio.save('hello-tts.wav', waveforms, vocoder.sample_rate)

    Example - Phoneme-based TTS pipeline with Tacotron2 and WaveRNN
        >>>
        >>> # Note:
        >>> #     This bundle uses pre-trained DeepPhonemizer as
        >>> #     the text pre-processor.
        >>> #     Please install deep-phonemizer.
        >>> #     See https://github.com/as-ideas/DeepPhonemizer
        >>> #     The pretrained weight is automatically downloaded.
        >>>
        >>> import torchaudio
        >>>
        >>> text = "Hello, TTS!"
        >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
        >>>
        >>> # Build processor, Tacotron2 and WaveRNN model
        >>> processor = bundle.get_text_processor()
        Downloading:
        100%|███████████████████████████████| 63.6M/63.6M [00:04<00:00, 15.3MB/s]
        >>> tacotron2 = bundle.get_tacotron2()
        Downloading:
        100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s]
        >>> vocoder = bundle.get_vocoder()
        Downloading:
        100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s]
        >>>
        >>> # Encode text
        >>> input, lengths = processor(text)
        >>>
        >>> # Generate (mel-scale) spectrogram
        >>> specgram, lengths, _ = tacotron2.infer(input, lengths)
        >>>
        >>> # Convert spectrogram to waveform
        >>> waveforms, lengths = vocoder(specgram, lengths)
        >>>
        >>> torchaudio.save('hello-tts.wav', waveforms, vocoder.sample_rate)
    """

    # Using the inner class so that these interfaces are not directly exposed on
    # `torchaudio.pipelines`, but still listed in documentation.
    # The thing is, text processing and vocoder are generic and we do not know what kind of
    # new text processing and vocoder will be added in the future, so we want to make these
    # interfaces specific to this Tacotron2TTS pipeline.
    class TextProcessor(_TextProcessor):
        """Interface of the text processing part of Tacotron2TTS pipeline

        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage.
        """

    class Vocoder(_Vocoder):
        """Interface of the vocoder part of Tacotron2TTS pipeline

        See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage.
        """

    @abstractmethod
    def get_text_processor(self, *, dl_kwargs=None) -> TextProcessor:
        """Create a text processor

        For character-based pipeline, this processor splits the input text by character.
        For phoneme-based pipeline, this processor converts the input text (grapheme) to
        phonemes.

        If a pre-trained weight file is necessary,
        :func:`torch.hub.download_url_to_file` is used to download it.

        Args:
            dl_kwargs (dictionary of keyword arguments):
                Passed to :func:`torch.hub.download_url_to_file`.

        Returns:
            TextProcessor:
                A callable which takes a string or a list of strings as input and
                returns Tensor of encoded texts and Tensor of valid lengths.
                The object also has ``tokens`` property, which can be used to
                recover the tokenized form.

        Example - Character-based
            >>> text = [
            >>>     "Hello World!",
            >>>     "Text-to-speech!",
            >>> ]
            >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH
            >>> processor = bundle.get_text_processor()
            >>> input, lengths = processor(text)
            >>>
            >>> print(input)
            tensor([[19, 16, 23, 23, 26, 11, 34, 26, 29, 23, 15,  2,  0,  0,  0],
                    [31, 16, 35, 31,  1, 31, 26,  1, 30, 27, 16, 16, 14, 19,  2]],
                   dtype=torch.int32)
            >>>
            >>> print(lengths)
            tensor([12, 15], dtype=torch.int32)
            >>>
            >>> print([processor.tokens[i] for i in input[0, :lengths[0]]])
            ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!']
            >>> print([processor.tokens[i] for i in input[1, :lengths[1]]])
            ['t', 'e', 'x', 't', '-', 't', 'o', '-', 's', 'p', 'e', 'e', 'c', 'h', '!']

        Example - Phoneme-based
            >>> text = [
            >>>     "Hello World!",
            >>>     "Text-to-speech!",
            >>> ]
            >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH
            >>> processor = bundle.get_text_processor()
            Downloading:
            100%|███████████████████████████████| 63.6M/63.6M [00:04<00:00, 15.3MB/s]
            >>> input, lengths = processor(text)
            >>>
            >>> print(input)
            tensor([[54, 20, 65, 69, 11, 92, 44, 65, 38,  2,  0,  0,  0,  0],
                    [81, 40, 64, 79, 81,  1, 81, 20,  1, 79, 77, 59, 37,  2]],
                   dtype=torch.int32)
            >>>
            >>> print(lengths)
            tensor([10, 14], dtype=torch.int32)
            >>>
            >>> # The first sample is shorter, so its tail shows the padding token.
            >>> print([processor.tokens[i] for i in input[0]])
            ['HH', 'AH', 'L', 'OW', ' ', 'W', 'ER', 'L', 'D', '!', '_', '_', '_', '_']
            >>> print([processor.tokens[i] for i in input[1]])
            ['T', 'EH', 'K', 'S', 'T', '-', 'T', 'AH', '-', 'S', 'P', 'IY', 'CH', '!']
        """

    @abstractmethod
    def get_vocoder(self, *, dl_kwargs=None) -> Vocoder:
        """Create a vocoder module, based off of either WaveRNN or GriffinLim.

        If a pre-trained weight file is necessary,
        :func:`torch.hub.load_state_dict_from_url` is used to download it.

        Args:
            dl_kwargs (dictionary of keyword arguments):
                Passed to :func:`torch.hub.load_state_dict_from_url`.

        Returns:
            Vocoder:
                A vocoder module, which takes spectrogram Tensor and an optional
                length Tensor, then returns resulting waveform Tensor and an optional
                length Tensor.
        """

    @abstractmethod
    def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2:
        """Create a Tacotron2 model with pre-trained weight.

        Args:
            dl_kwargs (dictionary of keyword arguments):
                Passed to :func:`torch.hub.load_state_dict_from_url`.

        Returns:
            Tacotron2:
                The resulting model.
        """
|