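# NOTE: the text "Spaces: Runtime error" preceded this code; it appears to be
# page-status residue from where the file was captured, not part of the module.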
import librosa
import numpy as np
import parselmouth
from parselmouth.praat import call
class ProsodicFeatureExtractor:
    """
    A class for extracting various prosodic features from audio data.

    Attributes:
        y (numpy.array): Audio time series.
        sr (int): Sampling rate of the audio time series.
        audio_arr (numpy.array): Original audio array for parselmouth processing.
        orig_sr (int): Original sampling rate of the audio array.

    Methods:
        extract(features_to_extract=None): Extracts specified prosodic features from audio.
        extract_f0(): Extracts fundamental frequency (F0) from audio.
        extract_energy(): Extracts energy from audio.
        extract_speaking_rate(): Estimates the speaking rate from audio.
        extract_pauses(): Detects pauses from audio.
        extract_formants(): Extracts formant frequencies from audio.
    """

    def __init__(self, y, sr, audio_arr, orig_sr):
        """
        Initializes the ProsodicFeatureExtractor with audio data.

        Args:
            y: Audio time series used by the librosa-based extractors.
            sr: Sampling rate of ``y``.
            audio_arr: Audio samples used by the parselmouth-based extractors.
            orig_sr: Sampling rate of ``audio_arr``.
        """
        self.y = y
        self.sr = sr
        self.audio_arr = audio_arr
        self.orig_sr = orig_sr

    def _make_sound(self):
        """Builds a parselmouth ``Sound`` from ``audio_arr`` at ``orig_sr``."""
        return parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)

    def extract(self, features_to_extract=None):
        """
        Extracts the specified prosodic features.

        Args:
            features_to_extract (list, optional): List of feature names to extract.
                Defaults to all available features if None. Names that are not
                recognised are silently skipped.

        Returns:
            dict: A dictionary mapping each requested, recognised feature name
            to the value returned by its extractor.
        """
        feature_funcs = {
            'f0': self.extract_f0,
            'energy': self.extract_energy,
            'speaking_rate': self.extract_speaking_rate,
            'pauses': self.extract_pauses,
            'formants': self.extract_formants,
        }
        if features_to_extract is None:
            features_to_extract = feature_funcs  # iterating a dict yields its keys

        features = {}
        for feature in features_to_extract:
            if feature not in feature_funcs:
                continue
            result = feature_funcs[feature]()
            if isinstance(result, tuple):
                # None of the extractors in this class returns a tuple; the
                # branch is kept unchanged so existing behaviour is preserved.
                features.update(result)
            else:
                features[feature] = result
        return features

    def extract_f0(self):
        """
        Extracts the fundamental frequency (F0) using PYIN algorithm.

        The search range is C2..C7. Frames for which PYIN reports NaN are
        returned as 0.

        Returns:
            numpy.ndarray: F0 per frame, with NaN replaced by 0.
        """
        # Pass the real sampling rate: without ``sr`` librosa assumes 22050 Hz
        # and the returned frequencies are mis-scaled for any other rate.
        f0, _, _ = librosa.pyin(
            self.y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sr,
        )
        return np.nan_to_num(f0)

    def extract_energy(self):
        """
        Extracts the root-mean-square (RMS) energy from the audio.

        Returns:
            numpy.ndarray: First row of ``librosa.feature.rms`` for ``y``.
        """
        return librosa.feature.rms(y=self.y)[0]

    def extract_speaking_rate(self):
        """
        Estimates the speaking rate from the intensity contour.

        The value returned is the number of intensity frames whose value
        exceeds 30% of the maximum intensity, divided by the total duration.

        NOTE(review): this counts frames above the threshold, not distinct
        intensity peaks, so it is only a rough proxy for syllables per second.
        The computation is kept as-is so existing consumers see the same
        numbers; confirm before switching to peak counting.

        Returns:
            float or None: The estimated rate, or None if extraction failed
            (the error is printed).
        """
        try:
            snd = self._make_sound()
            total_duration = snd.get_total_duration()
            # Flatten to 1-D so max/comparison are plain vectorised operations.
            intensity_values = np.asarray(snd.to_intensity().values).ravel()
            threshold = 0.3 * intensity_values.max()
            syllable_count = int(np.count_nonzero(intensity_values > threshold))
            return syllable_count / total_duration
        except Exception as e:
            print(f'Error extracting speaking rate: {e}')
            return None

    def extract_pauses(self):
        """
        Identifies and timestamps pauses in the audio.

        Uses Praat's "To TextGrid (silences)" and collects every interval on
        tier 1 that is labelled "silent".

        Returns:
            list or None: ``(start_time, end_time)`` tuples, one per silent
            interval, or None if extraction failed (the error is printed).
        """
        try:
            snd = self._make_sound()
            # Praat arguments, in order: minimum pitch (Hz), time step
            # (0 = automatic), silence threshold (dB), minimum silent interval
            # (s), minimum sounding interval (s), silent label, sounding label.
            silences = call(snd, "To TextGrid (silences)", 100, 0, -25, 0.1, 0.1, "silent", "sounding")
            n_intervals = call(silences, "Get number of intervals", 1)
            pauses = []
            for idx in range(1, n_intervals + 1):  # Praat intervals are 1-based
                if call(silences, "Get label of interval", 1, idx) != "silent":
                    continue
                start = call(silences, "Get start time of interval", 1, idx)
                end = call(silences, "Get end time of interval", 1, idx)
                pauses.append((start, end))
            return pauses
        except Exception as e:
            print(f'Error extracting pauses: {e}')
            return None

    def extract_formants(self):
        """
        Extracts the first three formant frequencies using the Burg method.

        Returns:
            dict: Keys ``F1_mean``, ``F1_stdev`` ... ``F3_mean``, ``F3_stdev``
            in Hertz, or an empty dict if extraction failed (the error is
            printed).
        """
        try:
            snd = self._make_sound()
            # Praat arguments, in order: time step (s), maximum number of
            # formants, formant ceiling (Hz), window length (s), pre-emphasis
            # from (Hz).
            formant = call(snd, "To Formant (burg)", 0.025, 5, 5500, 0.025, 50)
            formant_values = {}
            for i in range(1, 4):  # Extracting the first three formants
                # A time range of 0, 0 selects the whole signal.
                formant_values[f'F{i}_mean'] = call(formant, "Get mean", i, 0, 0, "Hertz")
                formant_values[f'F{i}_stdev'] = call(formant, "Get standard deviation", i, 0, 0, "Hertz")
            return formant_values
        except Exception as e:
            print(f'Error extracting formants: {e}')
            return {}