Spaces:

AIGC-Audio
/

AudioGPT

Build error

+import numpy as np
+import scipy.linalg
+from scipy.spatial.transform import Rotation as R
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+from src.warping import GeometricTimeWarper, MonotoneTimeWarper
+from src.utils import Net
+class GeometricWarper(nn.Module):
+    def __init__(self, sampling_rate=48000):
+        super().__init__()
+        self.warper = GeometricTimeWarper(sampling_rate=sampling_rate)
+    def _transmitter_mouth(self, view):
+        # offset between tracking markers and real mouth position in the dataset
+        mouth_offset = np.array([0.09, 0, -0.20])
+        quat = view[:, 3:, :].transpose(2, 1).contiguous().detach().cpu().view(-1, 4).numpy()
+        # make sure zero-padded values are set to non-zero values (else scipy raises an exception)
+        norms = scipy.linalg.norm(quat, axis=1)
+        eps_val = (norms == 0).astype(np.float32)
+        quat = quat + eps_val[:, None]
+        transmitter_rot_mat = R.from_quat(quat)
+        transmitter_mouth = transmitter_rot_mat.apply(mouth_offset, inverse=True)
+        transmitter_mouth = th.Tensor(transmitter_mouth).view(view.shape[0], -1, 3).transpose(2, 1).contiguous()
+        if view.is_cuda:
+            transmitter_mouth = transmitter_mouth.cuda()
+        return transmitter_mouth
+    def _3d_displacements(self, view):
+        transmitter_mouth = self._transmitter_mouth(view)
+        # offset between tracking markers and ears in the dataset
+        left_ear_offset = th.Tensor([0, -0.08, -0.22]).cuda() if view.is_cuda else th.Tensor([0, -0.08, -0.22])
+        right_ear_offset = th.Tensor([0, 0.08, -0.22]).cuda() if view.is_cuda else th.Tensor([0, 0.08, -0.22])
+        # compute displacements between transmitter mouth and receiver left/right ear
+        displacement_left = view[:, 0:3, :] + transmitter_mouth - left_ear_offset[None, :, None]
+        displacement_right = view[:, 0:3, :] + transmitter_mouth - right_ear_offset[None, :, None]
+        displacement = th.stack([displacement_left, displacement_right], dim=1)
+        return displacement
+    def _warpfield(self, view, seq_length):
+        return self.warper.displacements2warpfield(self._3d_displacements(view), seq_length)
+    def forward(self, mono, view):
+        '''
+        :param mono: input signal as tensor of shape B x 1 x T
+        :param view: rx/tx position/orientation as tensor of shape B x 7 x K (K = T / 400)
+        :return: warped: warped left/right ear signal as tensor of shape B x 2 x T
+        '''
+        return self.warper(th.cat([mono, mono], dim=1), self._3d_displacements(view))
+class Warpnet(nn.Module):
+    def __init__(self, layers=4, channels=64, view_dim=7):
+        super().__init__()
+        self.layers = [nn.Conv1d(view_dim if l == 0 else channels, channels, kernel_size=2) for l in range(layers)]
+        self.layers = nn.ModuleList(self.layers)
+        self.linear = nn.Conv1d(channels, 2, kernel_size=1)
+        self.neural_warper = MonotoneTimeWarper()
+        self.geometric_warper = GeometricWarper()
+    def neural_warpfield(self, view, seq_length):
+        warpfield = view
+        for layer in self.layers:
+            warpfield = F.pad(warpfield, pad=[1, 0])
+            warpfield = F.relu(layer(warpfield))
+        warpfield = self.linear(warpfield)
+        warpfield = F.interpolate(warpfield, size=seq_length)
+        return warpfield
+    def forward(self, mono, view):
+        '''
+        :param mono: input signal as tensor of shape B x 1 x T
+        :param view: rx/tx position/orientation as tensor of shape B x 7 x K (K = T / 400)
+        :return: warped: warped left/right ear signal as tensor of shape B x 2 x T
+        '''
+        geometric_warpfield = self.geometric_warper._warpfield(view, mono.shape[-1])
+        neural_warpfield = self.neural_warpfield(view, mono.shape[-1])
+        warpfield = geometric_warpfield + neural_warpfield
+        # ensure causality
+        warpfield = -F.relu(-warpfield) # the predicted warp
+        warped = self.neural_warper(th.cat([mono, mono], dim=1), warpfield)
+        return warped
+class BinauralNetwork(Net):
+    def __init__(self,
+                 view_dim=7,
+                 warpnet_layers=4,
+                 warpnet_channels=64,
+                 model_name='binaural_network',
+                 use_cuda=True):
+        super().__init__(model_name, use_cuda)
+        self.warper = Warpnet(warpnet_layers, warpnet_channels)
+        if self.use_cuda:
+            self.cuda()
+    def forward(self, mono, view):
+        '''
+        :param mono: the input signal as a B x 1 x T tensor
+        :param view: the receiver/transmitter position as a B x 7 x T tensor
+        :return: out: the binaural output produced by the network
+                 intermediate: a two-channel audio signal obtained from the output of each intermediate layer
+                               as a list of B x 2 x T tensors
+        '''
+        # print('mono ', mono.shape)
+        # print('view ', view.shape)
+        warped = self.warper(mono, view)
+        # print('warped ', warped.shape)
+        return warped

mono2binaural/src/utils.py ADDED Viewed

	@@ -0,0 +1,251 @@

+"""
+Copyright (c) Facebook, Inc. and its affiliates.
+All rights reserved.
+This source code is licensed under the license found in the
+LICENSE file in the root directory of this source tree.
+"""
+import numpy as np
+import torch as th
+#import torchaudio as ta
+class Net(th.nn.Module):
+    def __init__(self, model_name="network", use_cuda=True):
+        super().__init__()
+        self.use_cuda = use_cuda
+        self.model_name = model_name
+    def save(self, model_dir, suffix=''):
+        '''
+        save the network to model_dir/model_name.suffix.net
+        :param model_dir: directory to save the model to
+        :param suffix: suffix to append after model name
+        '''
+        if self.use_cuda:
+            self.cpu()
+        if suffix == "":
+            fname = f"{model_dir}/{self.model_name}.net"
+        else:
+            fname = f"{model_dir}/{self.model_name}.{suffix}.net"
+        th.save(self.state_dict(), fname)
+        if self.use_cuda:
+            self.cuda()
+    def load_from_file(self, model_file):
+        '''
+        load network parameters from model_file
+        :param model_file: file containing the model parameters
+        '''
+        if self.use_cuda:
+            self.cpu()
+        states = th.load(model_file)
+        self.load_state_dict(states)
+        if self.use_cuda:
+            self.cuda()
+        print(f"Loaded: {model_file}")
+    def load(self, model_dir, suffix=''):
+        '''
+        load network parameters from model_dir/model_name.suffix.net
+        :param model_dir: directory to load the model from
+        :param suffix: suffix to append after model name
+        '''
+        if suffix == "":
+            fname = f"{model_dir}/{self.model_name}.net"
+        else:
+            fname = f"{model_dir}/{self.model_name}.{suffix}.net"
+        self.load_from_file(fname)
+    def num_trainable_parameters(self):
+        '''
+        :return: the number of trainable parameters in the model
+        '''
+        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+# class NewbobAdam(th.optim.Adam):
+#     def __init__(self,
+#                  weights,
+#                  net,
+#                  artifacts_dir,
+#                  initial_learning_rate=0.001,
+#                  decay=0.5,
+#                  max_decay=0.01
+#                  ):
+#         '''
+#         Newbob learning rate scheduler
+#         :param weights: weights to optimize
+#         :param net: the network, must be an instance of type src.utils.Net
+#         :param artifacts_dir: (str) directory to save/restore models to/from
+#         :param initial_learning_rate: (float) initial learning rate
+#         :param decay: (float) value to decrease learning rate by when loss doesn't improve further
+#         :param max_decay: (float) maximum decay of learning rate
+#         '''
+#         super().__init__(weights, lr=initial_learning_rate)
+#         self.last_epoch_loss = np.inf
+#         self.total_decay = 1
+#         self.net = net
+#         self.decay = decay
+#         self.max_decay = max_decay
+#         self.artifacts_dir = artifacts_dir
+#         # store initial state as backup
+#         if decay < 1.0:
+#             net.save(artifacts_dir, suffix="newbob")
+#     def update_lr(self, loss):
+#         '''
+#         update the learning rate based on the current loss value and historic loss values
+#         :param loss: the loss after the current iteration
+#         '''
+#         if loss > self.last_epoch_loss and self.decay < 1.0 and self.total_decay > self.max_decay:
+#             self.total_decay = self.total_decay * self.decay
+#             print(f"NewbobAdam: Decay learning rate (loss degraded from {self.last_epoch_loss} to {loss})."
+#                   f"Total decay: {self.total_decay}")
+#             # restore previous network state
+#             self.net.load(self.artifacts_dir, suffix="newbob")
+#             # decrease learning rate
+#             for param_group in self.param_groups:
+#                 param_group['lr'] = param_group['lr'] * self.decay
+#         else:
+#             self.last_epoch_loss = loss
+#         # save last snapshot to restore it in case of lr decrease
+#         if self.decay < 1.0 and self.total_decay > self.max_decay:
+#             self.net.save(self.artifacts_dir, suffix="newbob")
+# class FourierTransform:
+#     def __init__(self,
+#                  fft_bins=2048,
+#                  win_length_ms=40,
+#                  frame_rate_hz=100,
+#                  causal=False,
+#                  preemphasis=0.0,
+#                  sample_rate=48000,
+#                  normalized=False):
+#         self.sample_rate = sample_rate
+#         self.frame_rate_hz = frame_rate_hz
+#         self.preemphasis = preemphasis
+#         self.fft_bins = fft_bins
+#         self.win_length = int(sample_rate * win_length_ms / 1000)
+#         self.hop_length = int(sample_rate / frame_rate_hz)
+#         self.causal = causal
+#         self.normalized = normalized
+#         if self.win_length > self.fft_bins:
+#             print('FourierTransform Warning: fft_bins should be larger than win_length')
+#     def _convert_format(self, data, expected_dims):
+#         if not type(data) == th.Tensor:
+#             data = th.Tensor(data)
+#         if len(data.shape) < expected_dims:
+#             data = data.unsqueeze(0)
+#         if not len(data.shape) == expected_dims:
+#             raise Exception(f"FourierTransform: data needs to be a Tensor with {expected_dims} dimensions but got shape {data.shape}")
+#         return data
+#     def _preemphasis(self, audio):
+#         if self.preemphasis > 0:
+#             return th.cat((audio[:, 0:1], audio[:, 1:] - self.preemphasis * audio[:, :-1]), dim=1)
+#         return audio
+#     def _revert_preemphasis(self, audio):
+#         if self.preemphasis > 0:
+#             for i in range(1, audio.shape[1]):
+#                 audio[:, i] = audio[:, i] + self.preemphasis * audio[:, i-1]
+#         return audio
+#     def _magphase(self, complex_stft):
+#         mag, phase = ta.functional.magphase(complex_stft, 1.0)
+#         return mag, phase
+#     def stft(self, audio):
+#         '''
+#         wrapper around th.stft
+#         audio: wave signal as th.Tensor
+#         '''
+#         hann = th.hann_window(self.win_length)
+#         hann = hann.cuda() if audio.is_cuda else hann
+#         spec = th.stft(audio, n_fft=self.fft_bins, hop_length=self.hop_length, win_length=self.win_length,
+#                        window=hann, center=not self.causal, normalized=self.normalized)
+#         return spec.contiguous()
+#     def complex_spectrogram(self, audio):
+#         '''
+#         audio: wave signal as th.Tensor
+#         return: th.Tensor of size channels x frequencies x time_steps (channels x y_axis x x_axis)
+#         '''
+#         self._convert_format(audio, expected_dims=2)
+#         audio = self._preemphasis(audio)
+#         return self.stft(audio)
+#     def magnitude_phase(self, audio):
+#         '''
+#         audio: wave signal as th.Tensor
+#         return: tuple containing two th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
+#         '''
+#         stft = self.complex_spectrogram(audio)
+#         return self._magphase(stft)
+#     def mag_spectrogram(self, audio):
+#         '''
+#         audio: wave signal as th.Tensor
+#         return: magnitude spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
+#         '''
+#         return self.magnitude_phase(audio)[0]
+#     def power_spectrogram(self, audio):
+#         '''
+#         audio: wave signal as th.Tensor
+#         return: power spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
+#         '''
+#         return th.pow(self.mag_spectrogram(audio), 2.0)
+#     def phase_spectrogram(self, audio):
+#         '''
+#         audio: wave signal as th.Tensor
+#         return: phase spectrum as th.Tensor of size channels x frequencies x time_steps for magnitude and phase spectrum
+#         '''
+#         return self.magnitude_phase(audio)[1]
+#     def mel_spectrogram(self, audio, n_mels):
+#         '''
+#         audio: wave signal as th.Tensor
+#         n_mels: number of bins used for mel scale warping
+#         return: mel spectrogram as th.Tensor of size channels x n_mels x time_steps for magnitude and phase spectrum
+#         '''
+#         spec = self.power_spectrogram(audio)
+#         mel_warping = ta.transforms.MelScale(n_mels, self.sample_rate)
+#         return mel_warping(spec)
+#     def complex_spec2wav(self, complex_spec, length):
+#         '''
+#         inverse stft
+#         complex_spec: complex spectrum as th.Tensor of size channels x frequencies x time_steps x 2 (real part/imaginary part)
+#         length: length of the audio to be reconstructed (in frames)
+#         '''
+#         complex_spec = self._convert_format(complex_spec, expected_dims=4)
+#         hann = th.hann_window(self.win_length)
+#         hann = hann.cuda() if complex_spec.is_cuda else hann
+#         wav = ta.functional.istft(complex_spec, n_fft=self.fft_bins, hop_length=self.hop_length, win_length=self.win_length, window=hann, length=length, center=not self.causal)
+#         wav = self._revert_preemphasis(wav)
+#         return wav
+#     def magphase2wav(self, mag_spec, phase_spec, length):
+#         '''
+#         reconstruction of wav signal from magnitude and phase spectrum
+#         mag_spec: magnitude spectrum as th.Tensor of size channels x frequencies x time_steps
+#         phase_spec: phase spectrum as th.Tensor of size channels x frequencies x time_steps
+#         length: length of the audio to be reconstructed (in frames)
+#         '''
+#         mag_spec = self._convert_format(mag_spec, expected_dims=3)
+#         phase_spec = self._convert_format(phase_spec, expected_dims=3)
+#         complex_spec = th.stack([mag_spec * th.cos(phase_spec), mag_spec * th.sin(phase_spec)], dim=-1)
+#         return self.complex_spec2wav(complex_spec, length)

mono2binaural/src/warping.py ADDED Viewed

	@@ -0,0 +1,113 @@

+"""
+Copyright (c) Facebook, Inc. and its affiliates.
+All rights reserved.
+This source code is licensed under the license found in the
+LICENSE file in the root directory of this source tree.
+"""
+import torch as th
+import torch.nn as nn
+import torch.nn.functional as F
+class TimeWarperFunction(th.autograd.Function):
+    @staticmethod
+    def forward(ctx, input, warpfield):
+        '''
+        :param ctx: autograd context
+        :param input: input signal (B x 2 x T)
+        :param warpfield: the corresponding warpfield (B x 2 x T)
+        :return: the warped signal (B x 2 x T)
+        '''
+        ctx.save_for_backward(input, warpfield)
+        # compute index list to lookup warped input values
+        idx_left = warpfield.floor().type(th.long)
+        idx_right = th.clamp(warpfield.ceil().type(th.long), max=input.shape[-1]-1)
+        # compute weight for linear interpolation
+        alpha = warpfield - warpfield.floor()
+        # linear interpolation
+        output = (1 - alpha) * th.gather(input, 2, idx_left) + alpha * th.gather(input, 2, idx_right)
+        return output
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, warpfield = ctx.saved_tensors
+        # compute index list to lookup warped input values
+        idx_left = warpfield.floor().type(th.long)
+        idx_right = th.clamp(warpfield.ceil().type(th.long), max=input.shape[-1]-1)
+        # warpfield gradient
+        grad_warpfield = th.gather(input, 2, idx_right) - th.gather(input, 2, idx_left)
+        grad_warpfield = grad_output * grad_warpfield
+        # input gradient
+        grad_input = th.zeros(input.shape, device=input.device)
+        alpha = warpfield - warpfield.floor()
+        grad_input = grad_input.scatter_add(2, idx_left, grad_output * (1 - alpha)) + \
+                     grad_input.scatter_add(2, idx_right, grad_output * alpha)
+        return grad_input, grad_warpfield
+class TimeWarper(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.warper = TimeWarperFunction().apply
+    def _to_absolute_positions(self, warpfield, seq_length):
+        # translate warpfield from relative warp indices to absolute indices ([1...T] + warpfield)
+        temp_range = th.arange(seq_length, dtype=th.float)
+        temp_range = temp_range.cuda() if warpfield.is_cuda else temp_range
+        return th.clamp(warpfield + temp_range[None, None, :], min=0, max=seq_length-1)
+    def forward(self, input, warpfield):
+        '''
+        :param input: audio signal to be warped (B x 2 x T)
+        :param warpfield: the corresponding warpfield (B x 2 x T)
+        :return: the warped signal (B x 2 x T)
+        '''
+        warpfield = self._to_absolute_positions(warpfield, input.shape[-1])
+        warped = self.warper(input, warpfield)
+        return warped
+class MonotoneTimeWarper(TimeWarper):
+    def forward(self, input, warpfield):
+        '''
+        :param input: audio signal to be warped (B x 2 x T)
+        :param warpfield: the corresponding warpfield (B x 2 x T)
+        :return: the warped signal (B x 2 x T), ensured to be monotonous
+        '''
+        warpfield = self._to_absolute_positions(warpfield, input.shape[-1])
+        # ensure monotonicity: each warp must be at least as big as previous_warp-1
+        warpfield = th.cummax(warpfield, dim=-1)[0]
+        # print('warpfield ',warpfield.shape)
+        # warp
+        warped = self.warper(input, warpfield)
+        return warped
+class GeometricTimeWarper(TimeWarper):
+    def __init__(self, sampling_rate=48000):
+        super().__init__()
+        self.sampling_rate = sampling_rate
+    def displacements2warpfield(self, displacements, seq_length):
+        distance = th.sum(displacements**2, dim=2) ** 0.5
+        distance = F.interpolate(distance, size=seq_length)
+        warpfield = -distance / 343.0 * self.sampling_rate
+        return warpfield
+    def forward(self, input, displacements):
+        '''
+        :param input: audio signal to be warped (B x 2 x T)
+        :param displacements: sequence of 3D displacement vectors for geometric warping (B x 3 x T)
+        :return: the warped signal (B x 2 x T)
+        '''
+        warpfield = self.displacements2warpfield(displacements, input.shape[-1])
+        # print('Ge warpfield ', warpfield.shape)
+        # assert 1==2
+        warped = super().forward(input, warpfield)
+        return warped

mono2binaural/useful_ckpts/m2b/binaural_network.net ADDED Viewed

Binary file (107 kB). View file

mono2binaural/useful_ckpts/m2b/tx_positions.txt ADDED Viewed