submission-template

Sleeping

File size: 8,511 Bytes

import logging

import librosa
import numpy as np
import torch
import torch.nn as nn
# import pywt
from scipy import signal



def compute_cwt_power_spectrum(audio, sample_rate, num_freqs=128, f_min=20, f_max=None):
    """
    Compute the power spectrum of a continuous wavelet transform (complex Morlet).

    The transform is evaluated directly with FFT-based convolution rather than
    ``scipy.signal.cwt`` / ``scipy.signal.morlet2``, which were deprecated in
    SciPy 1.12 and removed in SciPy 1.15. The wavelet definition, its
    ``sqrt(1/scale)`` normalisation and the ``10 * scale`` support length are
    the same ones those SciPy helpers used.

    Parameters:
        audio: torch.Tensor
            One-dimensional input audio signal
        sample_rate: int
            Sampling rate of the audio
        num_freqs: int
            Number of frequency bins for the CWT
        f_min: float
            Minimum frequency to analyze
        f_max: float or None
            Maximum frequency to analyze (defaults to Nyquist frequency)

    Returns:
        torch.Tensor: float32 CWT power spectrum of shape
        (num_freqs, num_samples). Row ``i`` corresponds to the ``i``-th of
        ``num_freqs`` logarithmically spaced frequencies in [f_min, f_max].

    Raises:
        ValueError: if ``audio`` is not one-dimensional.
    """
    # Centre (angular) frequency of the Morlet mother wavelet.
    omega0 = 5.0

    # Convert to numpy; float64 keeps the convolution numerically stable
    audio_np = np.asarray(audio.cpu().numpy(), dtype=np.float64)
    if audio_np.ndim != 1:
        raise ValueError(
            f"audio must be one-dimensional, got shape {audio_np.shape}"
        )

    # Set default f_max to Nyquist frequency if not specified
    if f_max is None:
        f_max = sample_rate / 2

    # Generate frequency bins (logarithmically spaced)
    frequencies = np.logspace(
        np.log10(f_min),
        np.log10(f_max),
        num=num_freqs
    )

    # Scale (in samples) whose wavelet is centred on each frequency. For this
    # Morlet wavelet f = omega0 * fs / (2 * pi * scale); leaving omega0 out
    # would make every row analyse omega0 times its nominal frequency.
    scales = omega0 * sample_rate / (2 * np.pi * frequencies)

    num_samples = audio_np.shape[0]
    coefficients = np.empty((len(scales), num_samples), dtype=np.complex128)
    for row, scale in enumerate(scales):
        # Truncate the wavelet to ten scale-widths, never longer than the signal.
        support = min(10 * scale, num_samples)
        x = (np.arange(0, support) - (support - 1.0) / 2) / scale
        wavelet = (
            np.exp(1j * omega0 * x)
            * np.exp(-0.5 * x ** 2)
            * np.pi ** (-0.25)
            / np.sqrt(scale)
        )
        # Correlating with the wavelet == convolving with its reversed conjugate.
        kernel = np.conj(wavelet[::-1])
        coefficients[row] = signal.fftconvolve(audio_np, kernel, mode='same')

    # Compute power spectrum (magnitude squared)
    power_spectrum = np.abs(coefficients) ** 2

    # Convert to a float32 torch tensor
    return torch.from_numpy(power_spectrum.astype(np.float32))

# def compute_wavelet_transform(audio, wavelet, decompos_level):
#     """Compute wavelet decomposition of the audio signal."""
#     # Convert to numpy and ensure 1D
#     audio_np = audio.cpu().numpy()
#
#     # Perform wavelet decomposition
#     coeffs = pywt.wavedec(audio_np, wavelet, level=decompos_level)
#
#     # Stack coefficients into a 2D array
#     # First, pad all coefficient arrays to the same length
#     max_len = max(len(c) for c in coeffs)
#     padded_coeffs = []
#     for coeff in coeffs:
#         pad_len = max_len - len(coeff)
#         if pad_len > 0:
#             padded_coeff = np.pad(coeff, (0, pad_len), mode='constant')
#         else:
#             padded_coeff = coeff
#         padded_coeffs.append(padded_coeff)
#
#     # Stack into 2D array where each row is a different scale
#     wavelet_features = np.stack(padded_coeffs)
#
#     # Convert to tensor
#     return torch.FloatTensor(wavelet_features)


def compute_melspectrogram(audio, sample_rate):
    """
    Compute a 128-band mel spectrogram and convert it with ``power_to_db``.

    Parameters:
        audio: torch.Tensor
            Input audio signal
        sample_rate: int
            Sampling rate of the audio

    Returns:
        torch.Tensor: FloatTensor holding the output of ``librosa.power_to_db``
    """
    waveform = audio.cpu().numpy()
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate, n_mels=128)
    mel_db = librosa.power_to_db(mel_power)
    return torch.FloatTensor(mel_db)


def compute_mfcc(audio, sample_rate):
    """
    Compute 20 MFCCs of the audio signal via ``librosa.feature.mfcc``.

    Parameters:
        audio: torch.Tensor
            Input audio signal
        sample_rate: int
            Sampling rate of the audio

    Returns:
        torch.Tensor: FloatTensor holding the MFCC matrix returned by librosa
    """
    waveform = audio.cpu().numpy()
    coefficients = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=20)
    return torch.FloatTensor(coefficients)


def compute_chroma(audio, sample_rate):
    """
    Compute STFT-based chroma features via ``librosa.feature.chroma_stft``.

    Parameters:
        audio: torch.Tensor
            Input audio signal
        sample_rate: int
            Sampling rate of the audio

    Returns:
        torch.Tensor: FloatTensor holding the chromagram returned by librosa
    """
    waveform = audio.cpu().numpy()
    chromagram = librosa.feature.chroma_stft(y=waveform, sr=sample_rate)
    return torch.FloatTensor(chromagram)


def compute_time_domain_features(audio, sample_rate, frame_length=2048, hop_length=128):
    """
    Compute time-domain features from audio signal.

    Parameters:
        audio: torch.Tensor
            Input audio signal
        sample_rate: int
            Sampling rate of the audio
        frame_length: int
            Frame size used for framing, ZCR, RMS and the STFT
        hop_length: int
            Hop size between consecutive frames

    Returns:
        dict: maps feature name to a torch.Tensor. 'zcr', 'rms_energy', 'mean',
        'std', 'max' and 'envelope' hold a single value each; 'tempo' holds
        the tempo estimate returned by librosa.
    """
    # Convert to numpy
    audio_np = audio.cpu().numpy()

    # Initialize dictionary for features
    features = {}

    # 1. Zero Crossing Rate (summed over all frames, not averaged)
    zcr = librosa.feature.zero_crossing_rate(
        y=audio_np,
        frame_length=frame_length,
        hop_length=hop_length
    )
    features['zcr'] = torch.Tensor([zcr.sum()])

    # 2. Root Mean Square Energy
    rms = librosa.feature.rms(
        y=audio_np,
        frame_length=frame_length,
        hop_length=hop_length
    )
    features['rms_energy'] = torch.Tensor([rms.mean()])

    # 3. Temporal Statistics (per-frame statistic, then averaged over frames)
    frames = librosa.util.frame(audio_np, frame_length=frame_length, hop_length=hop_length)
    features['mean'] = torch.Tensor([np.mean(frames, axis=0).mean()])
    features['std'] = torch.Tensor([np.std(frames, axis=0).mean()])
    features['max'] = torch.Tensor([np.max(frames, axis=0).mean()])

    # 4. Tempo and Beat Features
    onset_env = librosa.onset.onset_strength(y=audio_np, sr=sample_rate)
    # librosa 0.10 moved the tempo estimator to librosa.feature.tempo and the
    # librosa.beat.tempo alias was later removed; prefer the new location and
    # fall back to the old one so both old and new librosa versions work.
    estimate_tempo = getattr(librosa.feature, 'tempo', None)
    if estimate_tempo is None:
        estimate_tempo = librosa.beat.tempo
    tempo = estimate_tempo(onset_envelope=onset_env, sr=sample_rate)
    # atleast_1d guards against a scalar estimate being read as a tensor size.
    features['tempo'] = torch.Tensor(np.atleast_1d(tempo))

    # 5. Amplitude Envelope (mean STFT magnitude)
    envelope = np.abs(librosa.stft(audio_np, n_fft=frame_length, hop_length=hop_length))
    features['envelope'] = torch.Tensor([np.mean(envelope, axis=0).mean()])

    return features


def _scalar_feature_or_nan(name, compute):
    """Run ``compute()`` and wrap its scalar result in a 1-element FloatTensor.

    Feature extraction is best-effort: any exception raised by ``compute`` is
    logged as a warning and replaced by NaN so one failing feature does not
    abort the others.
    """
    try:
        value = float(compute())
    except Exception as exc:  # deliberate best-effort boundary, see docstring
        logging.getLogger(__name__).warning(
            "Could not compute %s (%s: %s); using NaN",
            name, type(exc).__name__, exc,
        )
        value = float("nan")
    return torch.FloatTensor([value])


def compute_frequency_domain_features(audio, sample_rate, n_fft=2048, hop_length=512):
    """
    Compute frequency-domain features from audio signal.

    Each feature is computed independently; a feature whose computation raises
    is reported as NaN (and a warning is logged) instead of failing the call.

    Parameters:
        audio: torch.Tensor
            Input audio signal
        sample_rate: int
            Sampling rate of the audio
        n_fft: int
            FFT window size passed to every librosa call
        hop_length: int
            Hop size passed to every librosa call

    Returns:
        dict: maps 'spectral_centroid', 'spectral_rolloff',
        'spectral_bandwidth', 'spectral_contrast', 'spectral_flatness' and
        'spectral_flux' (in that order) to 1-element FloatTensors.
        Centroid, rolloff, bandwidth and flatness hold the maximum over the
        librosa output, contrast holds the mean, and flux holds the standard
        deviation of the frame-to-frame magnitude difference.
    """
    # Convert to numpy
    audio_np = audio.cpu().numpy()

    # Arguments shared by every spectral computation below
    stft_args = {'n_fft': n_fft, 'hop_length': hop_length}

    def spectral_flux_std():
        # Frame-to-frame change of the STFT magnitude; a zero column is
        # prepended so the flux keeps one column per frame.
        magnitude = np.abs(librosa.stft(audio_np, **stft_args))
        flux = np.diff(magnitude, axis=1)
        flux = np.pad(flux, ((0, 0), (1, 0)), mode='constant')
        return np.std(flux)

    # Insertion order defines the key order of the returned dictionary.
    extractors = {
        'spectral_centroid': lambda: librosa.feature.spectral_centroid(
            y=audio_np, sr=sample_rate, **stft_args
        ).max(),
        'spectral_rolloff': lambda: librosa.feature.spectral_rolloff(
            y=audio_np, sr=sample_rate, **stft_args
        ).max(),
        'spectral_bandwidth': lambda: librosa.feature.spectral_bandwidth(
            y=audio_np, sr=sample_rate, **stft_args
        ).max(),
        'spectral_contrast': lambda: librosa.feature.spectral_contrast(
            y=audio_np,
            sr=sample_rate,
            fmin=20,  # Lower minimum frequency
            n_bands=4,  # Reduce number of bands
            quantile=0.02,
            **stft_args
        ).mean(),
        'spectral_flatness': lambda: librosa.feature.spectral_flatness(
            y=audio_np, **stft_args
        ).max(),
        'spectral_flux': spectral_flux_std,
    }

    return {
        name: _scalar_feature_or_nan(name, compute)
        for name, compute in extractors.items()
    }


def compute_all_features(audio, sample_rate, wavelet='db1', decompos_level=4):
    """
    Collect the currently enabled feature groups for one audio signal.

    Only the frequency-domain group is produced at the moment; ``wavelet`` and
    ``decompos_level`` are accepted but not used by the active code path.

    Parameters:
        audio: torch.Tensor
            Input audio signal
        sample_rate: int
            Sampling rate of the audio
        wavelet: str
            Unused while the wavelet feature group is disabled
        decompos_level: int
            Unused while the wavelet feature group is disabled

    Returns:
        dict: ``{'frequency_domain': ...}`` holding the dictionary returned by
        ``compute_frequency_domain_features``.
    """
    # Disabled feature groups, kept here for reference:
    #   'wavelet'        -> compute_wavelet_transform(audio, wavelet, decompos_level)
    #   'melspectrogram' -> compute_melspectrogram(audio, sample_rate)
    #   'mfcc'           -> compute_mfcc(audio, sample_rate)
    #   'chroma'         -> compute_chroma(audio, sample_rate)
    #   'cwt_power'      -> compute_cwt_power_spectrum(
    #                           audio, sample_rate, num_freqs=128,
    #                           f_min=20, f_max=sample_rate // 2)
    #   'time_domain'    -> compute_time_domain_features(audio, sample_rate)
    frequency_features = compute_frequency_domain_features(audio, sample_rate)
    return {'frequency_domain': frequency_features}