Ajay Karthick Senthil Kumar
add src
381c43b
import librosa
import parselmouth
from parselmouth.praat import call
import numpy as np
class ProsodicFeatureExtractor:
"""
A class for extracting various prosodic features from audio data.
Attributes:
y (numpy.array): Audio time series.
sr (int): Sampling rate of the audio time series.
audio_arr (numpy.array): Original audio array for parselmouth processing.
orig_sr (int): Original sampling rate of the audio array
Methods:
extract(features_to_extract=None): Extracts specified prosodic features from audio.
extract_f0(): Extracts fundamental frequency (F0) from audio.
extract_energy(): Extracts energy from audio.
extract_speaking_rate(): Estimates the speaking rate from audio.
extract_pauses(): Detects pauses from audio.
extract_formants(): Extracts formant frequencies from audio.
"""
def __init__(self, y, sr, audio_arr, orig_sr):
"""
Initializes the ProsodicFeatureExtractor with audio data.
"""
self.y = y
self.sr = sr
self.audio_arr = audio_arr
self.orig_sr = orig_sr
def extract(self, features_to_extract=None):
"""
Extracts the specified prosodic features.
Args:
features_to_extract (list, optional): List of feature names to extract.
Defaults to all available features if None.
Returns:
dict: A dictionary containing the extracted features.
"""
feature_funcs = {
'f0': self.extract_f0,
'energy': self.extract_energy,
'speaking_rate': self.extract_speaking_rate,
'pauses': self.extract_pauses,
'formants': self.extract_formants
}
if features_to_extract is None:
features_to_extract = feature_funcs.keys()
features = {}
for feature in features_to_extract:
if feature in feature_funcs:
result = feature_funcs[feature]()
if isinstance(result, tuple):
features.update(result)
else:
features[feature] = result
return features
def extract_f0(self):
"""
Extracts the fundamental frequency (F0) using PYIN algorithm.
"""
f0, voiced_flag, voiced_probs = librosa.pyin(self.y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
f0 = np.nan_to_num(f0)
return f0
def extract_energy(self):
"""
Extracts the root-mean-square (RMS) energy from the audio.
"""
return librosa.feature.rms(y=self.y)[0]
def extract_speaking_rate(self):
"""
Estimates the speaking rate by calculating the number of syllables per second.
"""
try:
snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
total_duration = snd.get_total_duration()
intensity = snd.to_intensity()
intensity_values = intensity.values.T
threshold = 0.3 * max(intensity_values)
syllable_count = len([1 for i in intensity_values if i > threshold])
speaking_rate = syllable_count / total_duration
return speaking_rate
except Exception as e:
print(f'Error extracting speaking rate: {e}')
return None
def extract_pauses(self):
"""
Identifies and timestamps pauses in the audio.
"""
try:
snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
silences = call(snd, "To TextGrid (silences)", 100, 0, -25, 0.1, 0.1, "silent", "sounding")
pauses = [(call(silences, "Get start time of interval", 1, i), call(silences, "Get end time of interval", 1, i)) for i in range(1, call(silences, "Get number of intervals", 1) + 1) if call(silences, "Get label of interval", 1, i) == "silent"]
return pauses
except Exception as e:
print(f'Error extracting pauses: {e}')
return None
def extract_formants(self):
"""
Extracts the first three formant frequencies using the Burg method.
"""
try:
snd = parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)
formant = call(snd, "To Formant (burg)", 0.025, 5, 5500, 0.025, 50)
formant_values = {}
for i in range(1, 4): # Extracting the first three formants
formant_values[f'F{i}_mean'] = call(formant, "Get mean", i, 0, 0, "Hertz")
formant_values[f'F{i}_stdev'] = call(formant, "Get standard deviation", i, 0, 0, "Hertz")
return formant_values
except Exception as e:
print(f'Error extracting formants: {e}')
return {}