File size: 4,529 Bytes
e11256b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
Audio Processor Module
=======================

This module provides the AudioProcessor class, which decodes audio files through the
ffmpeg CLI and holds the result as a PyTorch tensor. It includes functionalities to
load, cut, and manage audio waveforms.

Available Classes:
- AudioProcessor: Processes audio waveforms and provides methods for loading, 
                    cutting, and handling audio.

Usage:
    from .audio_import import AudioProcessor

    processor = AudioProcessor.from_file("path/to/audiofile.wav")
    cut_waveform = processor.cut(start=1.0, end=5.0)

Constants:
- SAMPLE_RATE (int): Default sample rate for processing.
- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
"""

from subprocess import CalledProcessError, run
import numpy as np
import torch

# Default sample rate (Hz) requested from ffmpeg when none is given.
SAMPLE_RATE = 16000
# Divisor applied to decoded signed 16-bit PCM samples (2**15) so the
# resulting float32 waveform lies in [-1.0, 1.0).
NORMALIZATION_FACTOR = 32768.0


class AudioProcessor:
    """
    Holds an audio waveform as a torch tensor together with its sample rate
    and provides helpers for loading (via the ffmpeg CLI) and cutting audio.

    Attributes:
        waveform: torch.Tensor
            The audio waveform tensor.
        sr: int
            The sample rate of the audio.
    """

    def __init__(self, waveform: torch.Tensor,
                 sr: int = SAMPLE_RATE) -> None:
        """
        Initialize the AudioProcessor object.

        Args:
            waveform (torch.Tensor): The audio waveform tensor.
            sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.

        Raises:
            ValueError: If the provided sample rate is not of type int.
        """

        # Validate before storing. The message must not call len() on the
        # value: most non-int inputs (e.g. a float) are unsized, and the
        # resulting TypeError would replace the documented ValueError.
        if not isinstance(sr, int):
            raise ValueError(
                "Sample rate should be a single value of type int, "
                f"not {sr!r} of type {type(sr).__name__}")

        self.waveform = waveform
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
        """
        Create an AudioProcessor instance from an audio file.

        Args:
            file (str): The audio file path.
            *args, **kwargs: Forwarded unchanged to ``load_audio``
                (e.g. ``sr`` to choose the target sample rate).

        Returns:
            AudioProcessor: An instance of the AudioProcessor class containing the loaded audio.

        Raises:
            RuntimeError: If ffmpeg fails to decode the file.
        """

        samples, sr = cls.load_audio(file, *args, **kwargs)

        return cls(torch.from_numpy(samples), sr)

    def cut(self, start: float, end: float) -> torch.Tensor:
        """
        Cut a segment from the audio waveform between the specified start and end times.

        The start index is truncated toward zero and the end index is rounded
        up, so the returned slice covers the whole requested interval.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds. Python numbers, NumPy scalars
                and single-element torch tensors are accepted.

        Returns:
            torch.Tensor: The cut waveform segment.
        """

        first = int(start * self.sr)

        # Pick the ceil implementation from the type of the product rather
        # than from the type of `end`: NumPy scalars are neither `float`/`int`
        # nor tensors, and torch.ceil rejects anything that is not a tensor.
        scaled_end = end * self.sr
        if isinstance(scaled_end, torch.Tensor):
            last = int(torch.ceil(scaled_end))
        else:
            last = int(np.ceil(scaled_end))

        return self.waveform[first:last]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE) -> tuple:
        """
        Open an audio file and read it as a mono waveform, resampling if necessary.
        Decoding is delegated to the ffmpeg CLI, which must be available in PATH.

        Args:
            file (str): The audio file to open.
            sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.

        Returns:
            tuple: A NumPy array containing the audio waveform in float32 dtype
                    and the sample rate.

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        # This launches a subprocess to decode audio while down-mixing
        # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
        # fmt: off
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads", "0",
            "-i", file,
            "-f", "s16le",
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
            "-"
        ]
        # fmt: on
        try:
            raw = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            # ffmpeg's stderr is not guaranteed to be valid UTF-8; decode
            # leniently so a decoding error cannot mask the real failure.
            raise RuntimeError(
                f"Failed to load audio: {e.stderr.decode(errors='replace')}") from e

        # Interpret the raw bytes as signed 16-bit PCM and scale to float32.
        # astype() already returns a fresh, writable 1-D array.
        samples = np.frombuffer(raw, np.int16).astype(
            np.float32) / NORMALIZATION_FACTOR

        return samples, sr

    def __repr__(self) -> str:
        """Return a short summary: class name, number of samples, sample rate."""
        return (f'{type(self).__name__}('
                f'waveform={len(self.waveform)}, sr={int(self.sr)})')