Spaces:
Running
Running
import os | |
from functools import lru_cache | |
from typing import BinaryIO, Dict, Optional, Tuple, Type, Union | |
import torch | |
from torchaudio._extension import lazy_import_sox_ext | |
from torchaudio.io import CodecConfig | |
from torio._extension import lazy_import_ffmpeg_ext | |
from . import soundfile_backend | |
from .backend import Backend | |
from .common import AudioMetaData | |
from .ffmpeg import FFmpegBackend | |
from .soundfile import SoundfileBackend | |
from .sox import SoXBackend | |
def get_available_backends() -> Dict[str, Type[Backend]]: | |
backend_specs: Dict[str, Type[Backend]] = {} | |
if lazy_import_ffmpeg_ext().is_available(): | |
backend_specs["ffmpeg"] = FFmpegBackend | |
if lazy_import_sox_ext().is_available(): | |
backend_specs["sox"] = SoXBackend | |
if soundfile_backend._IS_SOUNDFILE_AVAILABLE: | |
backend_specs["soundfile"] = SoundfileBackend | |
return backend_specs | |
def get_backend(backend_name, backends) -> Backend: | |
if backend := backends.get(backend_name): | |
return backend | |
else: | |
raise ValueError( | |
f"Unsupported backend '{backend_name}' specified; ", | |
f"please select one of {list(backends.keys())} instead.", | |
) | |
def get_info_func(): | |
backends = get_available_backends() | |
def dispatcher( | |
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] | |
) -> Backend: | |
if backend_name is not None: | |
return get_backend(backend_name, backends) | |
for backend in backends.values(): | |
if backend.can_decode(uri, format): | |
return backend | |
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") | |
def info( | |
uri: Union[BinaryIO, str, os.PathLike], | |
format: Optional[str] = None, | |
buffer_size: int = 4096, | |
backend: Optional[str] = None, | |
) -> AudioMetaData: | |
"""Get signal information of an audio file. | |
Note: | |
When the input type is file-like object, this function cannot | |
get the correct length (``num_samples``) for certain formats, | |
such as ``vorbis``. | |
In this case, the value of ``num_samples`` is ``0``. | |
Args: | |
uri (path-like object or file-like object): | |
Source of audio data. The following types are accepted: | |
* ``path-like``: File path or URL. | |
* ``file-like``: Object with ``read(size: int) -> bytes`` method, | |
which returns byte string of at most ``size`` length. | |
format (str or None, optional): | |
If not ``None``, interpreted as hint that may allow backend to override the detected format. | |
(Default: ``None``) | |
buffer_size (int, optional): | |
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) | |
backend (str or None, optional): | |
I/O backend to use. | |
If ``None``, function selects backend given input and available backends. | |
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], | |
with the corresponding backend available. | |
(Default: ``None``) | |
.. seealso:: | |
:ref:`backend` | |
Returns: | |
AudioMetaData | |
""" | |
backend = dispatcher(uri, format, backend) | |
return backend.info(uri, format, buffer_size) | |
return info | |
def get_load_func(): | |
backends = get_available_backends() | |
def dispatcher( | |
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] | |
) -> Backend: | |
if backend_name is not None: | |
return get_backend(backend_name, backends) | |
for backend in backends.values(): | |
if backend.can_decode(uri, format): | |
return backend | |
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") | |
def load( | |
uri: Union[BinaryIO, str, os.PathLike], | |
frame_offset: int = 0, | |
num_frames: int = -1, | |
normalize: bool = True, | |
channels_first: bool = True, | |
format: Optional[str] = None, | |
buffer_size: int = 4096, | |
backend: Optional[str] = None, | |
) -> Tuple[torch.Tensor, int]: | |
"""Load audio data from source. | |
By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with | |
``float32`` dtype, and the shape of `[channel, time]`. | |
Note: | |
The formats this function can handle depend on the availability of backends. | |
Please use the following functions to fetch the supported formats. | |
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders` | |
- Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats` | |
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__. | |
.. warning:: | |
``normalize`` argument does not perform volume normalization. | |
It only converts the sample type to `torch.float32` from the native sample | |
type. | |
When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit | |
signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, | |
this function can return integer Tensor, where the samples are expressed within the whole range | |
of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, | |
``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not | |
support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. | |
``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as | |
``flac`` and ``mp3``. | |
For these formats, this function always returns ``float32`` Tensor with values. | |
Args: | |
uri (path-like object or file-like object): | |
Source of audio data. | |
frame_offset (int, optional): | |
Number of frames to skip before start reading data. | |
num_frames (int, optional): | |
Maximum number of frames to read. ``-1`` reads all the remaining samples, | |
starting from ``frame_offset``. | |
This function may return the less number of frames if there is not enough | |
frames in the given file. | |
normalize (bool, optional): | |
When ``True``, this function converts the native sample type to ``float32``. | |
Default: ``True``. | |
If input file is integer WAV, giving ``False`` will change the resulting Tensor type to | |
integer type. | |
This argument has no effect for formats other than integer WAV type. | |
channels_first (bool, optional): | |
When True, the returned Tensor has dimension `[channel, time]`. | |
Otherwise, the returned Tensor's dimension is `[time, channel]`. | |
format (str or None, optional): | |
If not ``None``, interpreted as hint that may allow backend to override the detected format. | |
(Default: ``None``) | |
buffer_size (int, optional): | |
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) | |
backend (str or None, optional): | |
I/O backend to use. | |
If ``None``, function selects backend given input and available backends. | |
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], | |
with the corresponding backend being available. (Default: ``None``) | |
.. seealso:: | |
:ref:`backend` | |
Returns: | |
(torch.Tensor, int): Resulting Tensor and sample rate. | |
If the input file has integer wav format and normalization is off, then it has | |
integer type, else ``float32`` type. If ``channels_first=True``, it has | |
`[channel, time]` else `[time, channel]`. | |
""" | |
backend = dispatcher(uri, format, backend) | |
return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size) | |
return load | |
def get_save_func(): | |
backends = get_available_backends() | |
def dispatcher( | |
uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] | |
) -> Backend: | |
if backend_name is not None: | |
return get_backend(backend_name, backends) | |
for backend in backends.values(): | |
if backend.can_encode(uri, format): | |
return backend | |
raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") | |
def save( | |
uri: Union[BinaryIO, str, os.PathLike], | |
src: torch.Tensor, | |
sample_rate: int, | |
channels_first: bool = True, | |
format: Optional[str] = None, | |
encoding: Optional[str] = None, | |
bits_per_sample: Optional[int] = None, | |
buffer_size: int = 4096, | |
backend: Optional[str] = None, | |
compression: Optional[Union[CodecConfig, float, int]] = None, | |
): | |
"""Save audio data to file. | |
Note: | |
The formats this function can handle depend on the availability of backends. | |
Please use the following functions to fetch the supported formats. | |
- FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders` | |
- Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats` | |
- SoundFile: Refer to `the official document <https://pysoundfile.readthedocs.io/>`__. | |
Args: | |
uri (str or pathlib.Path): Path to audio file. | |
src (torch.Tensor): Audio data to save. must be 2D tensor. | |
sample_rate (int): sampling rate | |
channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, | |
otherwise `[time, channel]`. | |
format (str or None, optional): Override the audio format. | |
When ``uri`` argument is path-like object, audio format is | |
inferred from file extension. If the file extension is missing or | |
different, you can specify the correct format with this argument. | |
When ``uri`` argument is file-like object, | |
this argument is required. | |
Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``. | |
encoding (str or None, optional): Changes the encoding for supported formats. | |
This argument is effective only for supported formats, i.e. | |
``"wav"`` and ``""flac"```. Valid values are | |
- ``"PCM_S"`` (signed integer Linear PCM) | |
- ``"PCM_U"`` (unsigned integer Linear PCM) | |
- ``"PCM_F"`` (floating point PCM) | |
- ``"ULAW"`` (mu-law) | |
- ``"ALAW"`` (a-law) | |
bits_per_sample (int or None, optional): Changes the bit depth for the | |
supported formats. | |
When ``format`` is one of ``"wav"`` and ``"flac"``, | |
you can change the bit depth. | |
Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. | |
buffer_size (int, optional): | |
Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) | |
backend (str or None, optional): | |
I/O backend to use. | |
If ``None``, function selects backend given input and available backends. | |
Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], | |
with the corresponding backend being available. | |
(Default: ``None``) | |
.. seealso:: | |
:ref:`backend` | |
compression (CodecConfig, float, int, or None, optional): | |
Compression configuration to apply. | |
If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided. | |
Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the | |
``sox`` command line interface must be provided. For instance: | |
``"mp3"`` | |
Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or | |
VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``. | |
``"flac"`` | |
Whole number from ``0`` to ``8``. ``8`` is default and highest compression. | |
``"ogg"``, ``"vorbis"`` | |
Number from ``-1`` to ``10``; ``-1`` is the highest compression | |
and lowest quality. Default: ``3``. | |
Refer to http://sox.sourceforge.net/soxformat.html for more details. | |
""" | |
backend = dispatcher(uri, format, backend) | |
return backend.save( | |
uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression | |
) | |
return save | |