import numpy as np
from fairseq.data.audio.feature_transforms import (
    AudioFeatureTransform,
    register_audio_feature_transform,
)


@register_audio_feature_transform("utterance_cmvn")
class UtteranceCMVN(AudioFeatureTransform):
    """Utterance-level CMVN (cepstral mean and variance normalization).

    Statistics are computed along axis 0 of the input passed to
    ``__call__`` and applied to that same input; nothing is carried over
    between calls.
    """

    @classmethod
    def from_config_dict(cls, config=None):
        """Build a transform from an optional config mapping.

        Args:
            config: mapping that may contain the keys ``"norm_means"`` and
                ``"norm_vars"``; ``None`` is treated as an empty mapping.
                Missing keys default to ``True``.

        Returns:
            A new instance of ``cls``.
        """
        _config = {} if config is None else config
        # Instantiate through ``cls`` so that a subclass inheriting this
        # constructor gets an instance of itself, not of the base class.
        return cls(
            _config.get("norm_means", True),
            _config.get("norm_vars", True),
        )

    def __init__(self, norm_means=True, norm_vars=True):
        """Store the two normalization switches.

        Args:
            norm_means: if true, subtract the axis-0 mean in ``__call__``.
            norm_vars: if true, divide by the axis-0 standard deviation
                in ``__call__``.
        """
        self.norm_means, self.norm_vars = norm_means, norm_vars

    def __repr__(self):
        return (
            self.__class__.__name__
            + f"(norm_means={self.norm_means}, norm_vars={self.norm_vars})"
        )

    def __call__(self, x):
        """Normalize ``x`` along axis 0 and return the result.

        Args:
            x: array exposing ``mean``, ``sum`` and ``shape``
                (presumably a NumPy array laid out as frames x features
                -- confirm against callers).

        Returns:
            ``x`` itself when both switches are off, otherwise a new array
            with the mean removed and/or scaled by the standard deviation.
        """
        # Both statistics are taken from the raw input, before any
        # centering, so the variance below does not depend on norm_means.
        mean = x.mean(axis=0)
        square_sums = (x ** 2).sum(axis=0)

        if self.norm_means:
            x = np.subtract(x, mean)
        if self.norm_vars:
            # Variance as E[x^2] - E[x]^2. Rounding can push this to zero
            # or slightly below, so it is floored at 1e-10 before the
            # square root to keep the divisor positive.
            var = square_sums / x.shape[0] - mean ** 2
            std = np.sqrt(np.maximum(var, 1e-10))
            x = np.divide(x, std)
        return x