import os
from pathlib import Path
from typing import List, Tuple, Union

import torchaudio
from torch import Tensor
from torch.utils.data import Dataset
from torchaudio._internal import download_url_to_file
from torchaudio.datasets.librispeech import _get_librispeech_metadata
from torchaudio.datasets.utils import _extract_tar

_ARCHIVE_NAME = "librispeech_finetuning"
_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
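# Glob patterns selecting the folders that make up each subset of the
# fine-tuning data: "10min" uses a single 10-minute split ("1h/0"), "1h" uses
# all six 10-minute splits ("1h/0" through "1h/5"), and "10h" adds the
# 9-hour split ("9h") on top of those.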
_SUBSET_MAP = {"10min": ["1h/0"], "1h": ["1h/*"], "10h": ["1h/*", "9h"]}


def _get_fileids_paths(path: Path, folders: List[str], _ext_audio: str) -> List[Tuple[str, str]]:
    """Get the file names and the corresponding file paths without `speaker_id`
    and `chapter_id` directories.

    The format of path is like:
        {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
        {root}/{_ARCHIVE_NAME}/9h/[clean, other]

    Args:
        path (Path): Root path to the dataset.
        folders (List[str]): Folders that contain the desired audio files.
        _ext_audio (str): Extension of audio files.

    Returns:
        List[Tuple[str, str]]:
            List of tuples where the first element is the relative path to the audio file.
            The format of relative path is like:
            1h/[0-5]/[clean, other] or 9h/[clean, other]
            The second element is the file name without audio extension.
    """
    path = Path(path)
    files_paths = []
    for folder in folders:
        paths = [p.relative_to(path) for p in path.glob(f"{folder}/*/*/*/*{_ext_audio}")]
        files_paths += [(str(p.parent.parent.parent), str(p.stem)) for p in paths]  # get subset folder and file name
    files_paths.sort(key=lambda x: x[0] + x[1])
    return files_paths
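# Illustrative example of the return value (the speaker/chapter/utterance IDs
# below are hypothetical and depend on the extracted archive):
#
#   _get_fileids_paths(Path("data/librispeech_finetuning"), ["1h/0"], ".flac")
#   -> [("1h/0/clean", "103-1240-0000"), ("1h/0/other", "3440-171006-0000"), ...]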
class LibriLightLimited(Dataset):
    """Subset of Libri-light :cite:`librilight` dataset,
    which was used in HuBERT :cite:`hsu2021hubert` for supervised fine-tuning.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        subset (str, optional): The subset to use. Options: [``"10min"``, ``"1h"``, ``"10h"``]
            (Default: ``"10min"``).
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (Default: ``False``).
    """

    _ext_txt = ".trans.txt"
    _ext_audio = ".flac"

    def __init__(
        self,
        root: Union[str, Path],
        subset: str = "10min",
        download: bool = False,
    ) -> None:
        if subset not in _SUBSET_MAP:
            raise ValueError(f"`subset` must be one of {_SUBSET_MAP.keys()}. Found: {subset}")
        folders = _SUBSET_MAP[subset]
        root = os.fspath(root)
        self._path = os.path.join(root, _ARCHIVE_NAME)
        archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz")
        if not os.path.isdir(self._path):
            if not download:
                raise RuntimeError("Dataset not found. Please use `download=True` to download")
            if not os.path.isfile(archive):
                download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
            _extract_tar(archive)
        self._fileids_paths = _get_fileids_paths(self._path, folders, self._ext_audio)
    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            Tuple of the following items:

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            int:
                Speaker ID
            int:
                Chapter ID
            int:
                Utterance ID
        """
        file_path, fileid = self._fileids_paths[n]
        metadata = _get_librispeech_metadata(fileid, self._path, file_path, self._ext_audio, self._ext_txt)
        waveform, _ = torchaudio.load(os.path.join(self._path, metadata[0]))
        return (waveform,) + metadata[1:]
    def __len__(self) -> int:
        return len(self._fileids_paths)
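# Minimal usage sketch (not part of the upstream module): assumes "./data" is a
# writable directory and, if the archive is not already present there, that
# network access is available for the download.
if __name__ == "__main__":
    dataset = LibriLightLimited("./data", subset="10min", download=True)
    waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id = dataset[0]
    print(waveform.shape, sample_rate, speaker_id, chapter_id, utterance_id)
    print(transcript)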