import os
from pathlib import Path
from typing import List, Optional, Tuple, Union

import torch
from torch.utils.data import Dataset
from torchaudio.datasets.utils import _load_waveform


_SAMPLE_RATE = 16000
_SPEAKERS = [
    "Aditi",
    "Amy",
    "Brian",
    "Emma",
    "Geraint",
    "Ivy",
    "Joanna",
    "Joey",
    "Justin",
    "Kendra",
    "Kimberly",
    "Matthew",
    "Nicole",
    "Raveena",
    "Russell",
    "Salli",
]


def _load_labels(file: Path, subset: str):
    """Load transcript, IOB, and intent labels for all utterances.

    Args:
        file (Path): The path to the label file.
        subset (str): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].

    Returns:
        Dictionary of labels, where the key is the filename of the audio,
            and the value is a Tuple of transcript, Inside–outside–beginning (IOB) label, and intent label.
    """
    labels = {}
    with open(file, "r") as f:
        for line in f:
            line = line.strip().split(" ")
            index = line[0]
            # The transcript and the IOB/intent labels are separated by a tab.
            trans, iob_intent = " ".join(line[1:]).split("\t")
            # Drop the sentence boundary markers (first and last tokens) from the transcript.
            trans = " ".join(trans.split(" ")[1:-1])
            # The final token is the intent; the tokens in between are the IOB tags.
            iob = " ".join(iob_intent.split(" ")[1:-1])
            intent = iob_intent.split(" ")[-1]
            # Keep only utterances that belong to the requested subset.
            if subset in index:
                labels[index] = (trans, iob, intent)
    return labels
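
# For illustration only: a hypothetical label line (the tokens below are made
# up, not taken from the actual SNIPS label file) would be parsed as follows.
#
#     "train-Aditi-0 BOS play a song EOS\tO O O B-music PlayMusic"
#
# yields index "train-Aditi-0", transcript "play a song",
# IOB tags "O O B-music", and intent "PlayMusic".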


class Snips(Dataset):
    """*Snips* :cite:`coucke2018snips` dataset.

    Args:
        root (str or Path): Root directory where the dataset's top level directory is found.
        subset (str): Subset of the dataset to use. Options: [``"train"``, ``"valid"``, ``"test"``].
        speakers (List[str] or None, optional): The speaker list to include in the dataset. If ``None``,
            include all speakers in the subset. (Default: ``None``)
        audio_format (str, optional): The extension of the audio files. Options: [``"mp3"``, ``"wav"``].
            (Default: ``"mp3"``)
    """

    _trans_file = "all.iob.snips.txt"

    def __init__(

        self,

        root: Union[str, Path],

        subset: str,

        speakers: Optional[List[str]] = None,

        audio_format: str = "mp3",

    ) -> None:
        if subset not in ["train", "valid", "test"]:
            raise ValueError('`subset` must be one of ["train", "valid", "test"].')
        if audio_format not in ["mp3", "wav"]:
            raise ValueError('`audio_format` must be one of ["mp3", "wav"].')

        root = Path(root)
        self._path = root / "SNIPS"
        self.audio_path = self._path / subset
        if speakers is None:
            speakers = _SPEAKERS

        if not os.path.isdir(self._path):
            raise RuntimeError(f"Dataset not found at {self._path}.")

        self.audio_paths = self.audio_path.glob(f"*.{audio_format}")
        self.data = []
        for audio_path in sorted(self.audio_paths):
            audio_name = str(audio_path.name)
            speaker = audio_name.split("-")[0]
            if speaker in speakers:
                self.data.append(audio_path)
        transcript_path = self._path / self._trans_file
        self.labels = _load_labels(transcript_path, subset)

    def get_metadata(self, n: int) -> Tuple[str, int, str, str, str, str]:
        """Get metadata for the n-th sample from the dataset. Returns filepath instead of waveform,
        but otherwise returns the same fields as :py:func:`__getitem__`.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            Tuple of the following items:

            str:
                Path to audio
            int:
                Sample rate
            str:
                File name
            str:
                Transcription of audio
            str:
                Inside–outside–beginning (IOB) label of transcription
            str:
                Intent label of the audio
        """
        audio_path = self.data[n]
        # Return the path relative to the dataset root; ``__getitem__`` passes it
        # to ``_load_waveform`` together with that root.
        relpath = os.path.relpath(audio_path, self._path)
        file_name = audio_path.with_suffix("").name
        transcript, iob, intent = self.labels[file_name]
        return relpath, _SAMPLE_RATE, file_name, transcript, iob, intent

    def __getitem__(self, n: int) -> Tuple[torch.Tensor, int, str, str, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            Tuple of the following items:

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                File name
            str:
                Transcription of audio
            str:
                Inside–outside–beginning (IOB) label of transcription
            str:
                Intent label of the audio
        """
        metadata = self.get_metadata(n)
        waveform = _load_waveform(self._path, metadata[0], metadata[1])
        return (waveform,) + metadata[1:]

    def __len__(self) -> int:
        return len(self.data)
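

# A minimal usage sketch (for illustration only; the root path below is a
# placeholder, and the SNIPS data is assumed to already be extracted under
# ``<root>/SNIPS`` with the label file ``all.iob.snips.txt`` next to the
# ``train``/``valid``/``test`` audio directories):
#
#     dataset = Snips("/path/to/root", subset="train", audio_format="wav")
#     waveform, sample_rate, file_name, transcript, iob, intent = dataset[0]
#
# Items can be batched with ``torch.utils.data.DataLoader`` once a collate
# function that pads the variable-length waveforms is supplied.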