import librosa
import numpy as np
import parselmouth
from parselmouth.praat import call


class ProsodicFeatureExtractor:
    """
    A class for extracting various prosodic features from audio data.

    Two views of the audio are held: ``y``/``sr`` are handed to librosa
    (F0, energy) and ``audio_arr``/``orig_sr`` are handed to
    parselmouth/Praat (speaking rate, pauses, formants).

    Attributes:
        y (numpy.array): Audio time series.
        sr (int): Sampling rate of the audio time series.
        audio_arr (numpy.array): Original audio array for parselmouth processing.
        orig_sr (int): Original sampling rate of the audio array.

    Methods:
        extract(features_to_extract=None): Extracts specified prosodic features from audio.
        extract_f0(): Extracts fundamental frequency (F0) from audio.
        extract_energy(): Extracts energy from audio.
        extract_speaking_rate(): Estimates the speaking rate from audio.
        extract_pauses(): Detects pauses from audio.
        extract_formants(): Extracts formant frequencies from audio.
    """

    # Fraction of the peak intensity a frame must exceed to be counted by
    # extract_speaking_rate().
    _INTENSITY_THRESHOLD_RATIO = 0.3

    def __init__(self, y, sr, audio_arr, orig_sr):
        """
        Initializes the ProsodicFeatureExtractor with audio data.

        Args:
            y (numpy.array): Audio time series used by the librosa-based extractors.
            sr (int): Sampling rate of ``y``.
            audio_arr (numpy.array): Audio samples used to build the parselmouth Sound.
            orig_sr (int): Sampling rate of ``audio_arr``.
        """
        self.y = y
        self.sr = sr
        self.audio_arr = audio_arr
        self.orig_sr = orig_sr

    def _make_sound(self):
        """Builds a parselmouth Sound from ``audio_arr`` at ``orig_sr``."""
        return parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)

    def extract(self, features_to_extract=None):
        """
        Extracts the specified prosodic features.

        Args:
            features_to_extract (list or str, optional): Feature names to extract
                ('f0', 'energy', 'speaking_rate', 'pauses', 'formants'). A single
                name may be given as a plain string. Defaults to all available
                features if None. Unknown names are ignored.

        Returns:
            dict: A dictionary mapping each requested feature name to the value
                returned by its extractor.
        """
        feature_funcs = {
            'f0': self.extract_f0,
            'energy': self.extract_energy,
            'speaking_rate': self.extract_speaking_rate,
            'pauses': self.extract_pauses,
            'formants': self.extract_formants,
        }

        if features_to_extract is None:
            features_to_extract = feature_funcs
        elif isinstance(features_to_extract, str):
            # A bare string would otherwise be iterated character by character
            # and silently match nothing.
            features_to_extract = [features_to_extract]

        features = {}
        for feature in features_to_extract:
            func = feature_funcs.get(feature)
            if func is None:
                continue
            result = func()
            if isinstance(result, tuple):
                # Tuples are merged into the top level as key/value pairs.
                # None of the extractors defined in this class returns a tuple,
                # so this only applies to overridden extractors.
                features.update(result)
            else:
                features[feature] = result
        return features

    def extract_f0(self):
        """
        Extracts the fundamental frequency (F0) using PYIN algorithm.

        The search range is C2 to C7. Frames for which pyin reports NaN are
        returned as 0.

        Returns:
            numpy.array: F0 per frame, with NaN replaced by 0.
        """
        # sr must be passed explicitly: pyin otherwise assumes its own default
        # sampling rate, which skews every estimate when self.sr differs.
        f0, _, _ = librosa.pyin(
            self.y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sr,
        )
        return np.nan_to_num(f0)

    def extract_energy(self):
        """
        Extracts the root-mean-square (RMS) energy from the audio.

        Returns:
            numpy.array: First row of the array returned by librosa.feature.rms.
        """
        return librosa.feature.rms(y=self.y)[0]

    def extract_speaking_rate(self):
        """
        Estimates the speaking rate from the Praat intensity contour.

        The value returned is the number of intensity frames exceeding 30% of
        the peak intensity, divided by the total duration of the sound.

        NOTE(review): this counts frames above a threshold, not syllable
        nuclei, so it is only a rough proxy for syllables per second.

        Returns:
            float or None: The estimated rate, or None if extraction failed
                (the error is printed).
        """
        try:
            snd = self._make_sound()
            total_duration = snd.get_total_duration()
            intensity_values = np.asarray(snd.to_intensity().values).ravel()
            threshold = self._INTENSITY_THRESHOLD_RATIO * intensity_values.max()
            syllable_count = int(np.count_nonzero(intensity_values > threshold))
            return syllable_count / total_duration
        except Exception as e:
            # Best effort: Praat/parselmouth failures must not abort extract().
            print(f'Error extracting speaking rate: {e}')
            return None

    def extract_pauses(self):
        """
        Identifies and timestamps pauses in the audio.

        Uses Praat's "To TextGrid (silences)" and collects every interval of
        tier 1 labelled "silent".

        Returns:
            list or None: (start_time, end_time) tuples, one per silent
                interval, or None if extraction failed (the error is printed).
        """
        try:
            snd = self._make_sound()
            # Praat arguments, in order: minimum pitch (Hz), time step (s),
            # silence threshold (dB), minimum silent interval (s), minimum
            # sounding interval (s), silent label, sounding label.
            silences = call(snd, "To TextGrid (silences)", 100, 0, -25, 0.1, 0.1,
                            "silent", "sounding")
            n_intervals = call(silences, "Get number of intervals", 1)

            pauses = []
            for i in range(1, n_intervals + 1):  # Praat intervals are 1-based
                if call(silences, "Get label of interval", 1, i) != "silent":
                    continue
                start = call(silences, "Get start time of interval", 1, i)
                end = call(silences, "Get end time of interval", 1, i)
                pauses.append((start, end))
            return pauses
        except Exception as e:
            # Best effort: Praat/parselmouth failures must not abort extract().
            print(f'Error extracting pauses: {e}')
            return None

    def extract_formants(self):
        """
        Extracts the first three formant frequencies using the Burg method.

        Returns:
            dict: Keys 'F1_mean', 'F1_stdev', ..., 'F3_mean', 'F3_stdev' in
                Hertz, computed over the whole sound. An empty dict is returned
                if extraction failed (the error is printed).
        """
        try:
            snd = self._make_sound()
            # Praat arguments, in order: time step (s), max number of formants,
            # maximum formant (Hz), window length (s), pre-emphasis from (Hz).
            formant = call(snd, "To Formant (burg)", 0.025, 5, 5500, 0.025, 50)

            formant_values = {}
            for i in range(1, 4):  # Extracting the first three formants
                # A time range of 0, 0 selects the whole sound.
                formant_values[f'F{i}_mean'] = call(
                    formant, "Get mean", i, 0, 0, "Hertz")
                formant_values[f'F{i}_stdev'] = call(
                    formant, "Get standard deviation", i, 0, 0, "Hertz")
            return formant_values
        except Exception as e:
            # Best effort: Praat/parselmouth failures must not abort extract().
            print(f'Error extracting formants: {e}')
            return {}