import math
import random
import torch
from torch import nn
from typing import Tuple
class PadCrop(nn.Module):
    "Pad or crop a signal to a fixed length of n_samples, optionally at a random offset"

    def __init__(self, n_samples, randomize=True):
        super().__init__()
        self.n_samples = n_samples
        self.randomize = randomize

    def __call__(self, signal):
        n, s = signal.shape
        # Pick a random crop offset unless randomization is disabled or the signal is too short
        start = 0 if (not self.randomize) else torch.randint(0, max(0, s - self.n_samples) + 1, []).item()
        end = start + self.n_samples
        # Zero-pad up to n_samples if the signal is shorter than the crop length
        output = signal.new_zeros([n, self.n_samples])
        output[:, :min(s, self.n_samples)] = signal[:, start:end]
        return output
class PadCrop_Normalized_T(nn.Module):
    "Pad or crop a signal to n_samples and return the chunk along with normalized timing information"

    def __init__(self, n_samples: int, sample_rate: int, randomize: bool = True):
        super().__init__()
        self.n_samples = n_samples
        self.sample_rate = sample_rate
        self.randomize = randomize

    def __call__(self, source: torch.Tensor) -> Tuple[torch.Tensor, float, float, int, int, torch.Tensor]:
        n_channels, n_samples = source.shape

        # Upper bound for the crop offset; 0 if the audio is shorter than the desired length
        upper_bound = max(0, n_samples - self.n_samples)

        # If randomize is False, always start at the beginning of the audio
        offset = 0
        if self.randomize and n_samples > self.n_samples:
            offset = random.randint(0, upper_bound)

        # Calculate the start and end times of the chunk, normalized to [0, 1]
        t_start = offset / (upper_bound + self.n_samples)
        t_end = (offset + self.n_samples) / (upper_bound + self.n_samples)

        # Create the chunk, zero-padded if the source is shorter than n_samples
        chunk = source.new_zeros([n_channels, self.n_samples])

        # Copy the audio into the chunk
        chunk[:, :min(n_samples, self.n_samples)] = source[:, offset:offset + self.n_samples]

        # Calculate the start time of the chunk and the total length of the source in seconds
        seconds_start = math.floor(offset / self.sample_rate)
        seconds_total = math.ceil(n_samples / self.sample_rate)

        # Create a mask the same length as the chunk with 1s where the audio is and 0s where it isn't
        padding_mask = torch.zeros([self.n_samples])
        padding_mask[:min(n_samples, self.n_samples)] = 1

        return (
            chunk,
            t_start,
            t_end,
            seconds_start,
            seconds_total,
            padding_mask
        )
class PhaseFlipper(nn.Module):
    "Randomly invert the phase of a signal"

    def __init__(self, p=0.5):
        super().__init__()
        self.p = p

    def __call__(self, signal):
        return -signal if (random.random() < self.p) else signal
class Mono(nn.Module):
    "Downmix a multichannel signal to mono by averaging across channels"

    def __call__(self, signal):
        return torch.mean(signal, dim=0, keepdim=True) if len(signal.shape) > 1 else signal
class Stereo(nn.Module):
    def __call__(self, signal):
        signal_shape = signal.shape
        # Check if it's mono
        if len(signal_shape) == 1:  # s -> 2, s
            signal = signal.unsqueeze(0).repeat(2, 1)
        elif len(signal_shape) == 2:
            if signal_shape[0] == 1:  # 1, s -> 2, s
                signal = signal.repeat(2, 1)
            elif signal_shape[0] > 2:  # ?, s -> 2, s
                signal = signal[:2, :]
        return signal
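

# Illustrative usage sketch (not part of the original file): chains the
# transforms above on a synthetic signal. The sample rate, chunk length, and
# the dummy torch.randn input are assumptions chosen for demonstration only.
if __name__ == "__main__":
    sample_rate = 44100
    n_samples = 2 ** 17  # ~3 seconds at 44.1 kHz

    # Fake 5-second mono recording: shape (1, 220500)
    signal = torch.randn(1, sample_rate * 5)

    # Force stereo, randomly flip phase, then pad/crop to a fixed length
    signal = Stereo()(signal)
    signal = PhaseFlipper(p=0.5)(signal)
    chunk, t_start, t_end, seconds_start, seconds_total, padding_mask = \
        PadCrop_Normalized_T(n_samples, sample_rate, randomize=True)(signal)

    print(chunk.shape)                   # torch.Size([2, 131072])
    print(t_start, t_end)                # normalized crop boundaries in [0, 1]
    print(seconds_start, seconds_total)  # crop start and total source length in seconds
    print(padding_mask.shape)            # torch.Size([131072])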