Spaces:
Runtime error
Runtime error
| import parselmouth | |
| from parselmouth.praat import call | |
| import numpy as np | |
| import math | |
class VoiceQualityFeatureExtractor:
    """
    Extract voice quality features from in-memory audio using Praat (via parselmouth).

    Attributes:
        audio_arr (numpy.array): The audio array used for processing.
        orig_sr (int): The original sampling rate of the audio.

    Methods:
        extract(features_to_extract=None): Main method to extract specified voice quality features.
        extract_jitter(): Extracts measures of frequency variation (jitter).
        extract_shimmer(): Extracts measures of amplitude variation (shimmer).
        extract_hnr(): Extracts the Harmonics-to-Noise Ratio (HNR).
        extract_speech_rate(): Calculates various speech rate metrics.
        measure_speech_rate(voiceID): Helper method to perform detailed speech rate analysis.
    """

    # Result keys of each extractor; shared by the success and the NaN-fallback paths.
    _JITTER_KEYS = ('jitter_local', 'jitter_rap', 'jitter_ppq5')
    _SHIMMER_KEYS = ('shimmer_local', 'shimmer_apq3', 'shimmer_apq5', 'shimmer_dda')
    _HNR_KEYS = ('hnr',)
    # Same order as the tuple returned by measure_speech_rate().
    _SPEECH_RATE_KEYS = (
        'voicedcount',
        'npause',
        'originaldur',
        'intensity_duration',
        'speakingrate',
        'articulationrate',
        'asd',
        'totalpauseduration',
    )

    def __init__(self, audio_arr, orig_sr):
        """
        Initializes the VoiceQualityFeatureExtractor with audio data.

        Args:
            audio_arr (numpy.array): The audio samples to analyse.
            orig_sr (int): The sampling rate of ``audio_arr``.
        """
        self.audio_arr = audio_arr
        self.orig_sr = orig_sr

    def _to_sound(self):
        """Build a fresh parselmouth Sound from the stored samples and sampling rate."""
        return parselmouth.Sound(self.audio_arr, sampling_frequency=self.orig_sr)

    @staticmethod
    def _nan_features(keys):
        """Return a dict mapping every name in ``keys`` to NaN (the failure fallback)."""
        return {key: np.nan for key in keys}

    def extract(self, features_to_extract=None):
        """
        Extracts specified voice quality features from the audio data.

        Args:
            features_to_extract (list of str or str, optional): Feature group names to
                extract ('jitter', 'shimmer', 'hnr', 'speech_rate'). A single name may be
                given as a plain string. Names that are not recognised are ignored.
                Defaults to extracting all available features if None.

        Returns:
            dict: A dictionary containing the extracted features. Extractors that return
            a dict are merged into the result; any other value is stored under the
            feature group name.
        """
        feature_funcs = {
            'jitter': self.extract_jitter,
            'shimmer': self.extract_shimmer,
            'hnr': self.extract_hnr,
            'speech_rate': self.extract_speech_rate,
        }
        if features_to_extract is None:
            features_to_extract = list(feature_funcs)
        elif isinstance(features_to_extract, str):
            # A bare string would otherwise be iterated character by character and
            # silently match nothing.
            features_to_extract = [features_to_extract]

        features = {}
        for feature in features_to_extract:
            func = feature_funcs.get(feature)
            if func is None:
                continue
            feature_values = func()
            if isinstance(feature_values, dict):
                features.update(feature_values)
            else:
                features[feature] = feature_values
        return features

    def extract_jitter(self):
        """
        Extracts jitter measures from the audio data.

        Returns:
            dict: 'jitter_local', 'jitter_rap' and 'jitter_ppq5'. All values are NaN if
            the analysis fails for any reason (the error is printed).
        """
        try:
            snd = self._to_sound()
            # Pitch range 75-500 Hz for the periodicity analysis.
            point_process = call(snd, "To PointProcess (periodic, cc)", 75, 500)
            # Whole signal (0, 0), period floor 0.0001 s, period ceiling 0.02 s,
            # maximum period factor 1.3.
            args = (0, 0, 0.0001, 0.02, 1.3)
            return {
                'jitter_local': call(point_process, "Get jitter (local)", *args),
                'jitter_rap': call(point_process, "Get jitter (rap)", *args),
                'jitter_ppq5': call(point_process, "Get jitter (ppq5)", *args),
            }
        except Exception as e:
            # Best-effort boundary: report and fall back to NaN so one failing
            # feature group does not abort the whole extraction.
            print(f'Error extracting jitter: {e}')
            return self._nan_features(self._JITTER_KEYS)

    def extract_shimmer(self):
        """
        Extracts shimmer measures from the audio data.

        Returns:
            dict: 'shimmer_local', 'shimmer_apq3', 'shimmer_apq5' and 'shimmer_dda'. All
            values are NaN if the analysis fails for any reason (the error is printed).
        """
        try:
            snd = self._to_sound()
            point_process = call(snd, "To PointProcess (periodic, cc)", 75, 500)
            # Shimmer queries need both the sound and its point process.
            objects = [snd, point_process]
            # Same settings as jitter plus a maximum amplitude factor of 1.6.
            args = (0, 0, 0.0001, 0.02, 1.3, 1.6)
            return {
                'shimmer_local': call(objects, "Get shimmer (local)", *args),
                'shimmer_apq3': call(objects, "Get shimmer (apq3)", *args),
                'shimmer_apq5': call(objects, "Get shimmer (apq5)", *args),
                'shimmer_dda': call(objects, "Get shimmer (dda)", *args),
            }
        except Exception as e:
            print(f'Error extracting shimmer: {e}')
            return self._nan_features(self._SHIMMER_KEYS)

    def extract_hnr(self):
        """
        Extracts the Harmonics-to-Noise Ratio (HNR) from the audio data.

        Returns:
            dict: {'hnr': mean harmonicity over the whole signal}, or {'hnr': NaN} if the
            analysis fails for any reason (the error is printed).
        """
        try:
            snd = self._to_sound()
            harmonicity = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            return {'hnr': call(harmonicity, "Get mean", 0, 0)}
        except Exception as e:
            print(f'Error extracting HNR: {e}')
            return self._nan_features(self._HNR_KEYS)

    def extract_speech_rate(self):
        """
        Calculates and extracts various metrics related to speech rate.

        Returns:
            dict: 'voicedcount', 'npause', 'originaldur', 'intensity_duration',
            'speakingrate', 'articulationrate', 'asd' and 'totalpauseduration'. All
            values are NaN if the analysis fails for any reason (the error is printed).
        """
        try:
            metrics = self.measure_speech_rate(self._to_sound())
            return dict(zip(self._SPEECH_RATE_KEYS, metrics))
        except Exception as e:
            print(f'Error extracting speech rate: {e}')
            return self._nan_features(self._SPEECH_RATE_KEYS)

    def measure_speech_rate(self, voiceID, silencedb=-25, mindip=2, minpause=0.3):
        """
        Performs a detailed analysis to measure various speech rate metrics from the given audio.

        Syllable nuclei are estimated as intensity peaks that rise above a threshold,
        are preceded by a sufficiently deep dip, and fall in a voiced, "sounding" part
        of the signal.

        Args:
            voiceID: Anything accepted by ``parselmouth.Sound(...)``; within this class a
                parselmouth Sound is passed.
            silencedb (float): Offset (dB) relative to the 0.99 intensity quantile used to
                derive the silence / peak thresholds.
            mindip (float): Minimum intensity difference (dB) between a peak and the dip
                that follows it for the peak to count.
            minpause (float): Minimum silent interval duration (s) passed to Praat's
                silence detection.

        Returns:
            tuple: (voicedcount, npause, originaldur, intensity_duration, speakingrate,
            articulationrate, asd, total_pause_duration). Ratios whose denominator is
            zero are returned as NaN; ``npause`` is never negative.
        """
        sound = parselmouth.Sound(voiceID)
        originaldur = sound.get_total_duration()
        intensity = sound.to_intensity(50)

        min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
        max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")
        # get .99 quantile to get maximum (without influence of non-speech sound bursts)
        max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)

        # estimate Intensity threshold
        threshold = max(max_99_intensity + silencedb, min_intensity)
        # Silence threshold for the TextGrid, corrected by how far the absolute
        # maximum lies above the 0.99 quantile.
        silence_threshold = silencedb - (max_intensity - max_99_intensity)

        # get pauses (silences) and speakingtime
        textgrid = call(
            intensity, "To TextGrid (silences)",
            silence_threshold, minpause, 0.1, "silent", "sounding",
        )
        silencetier = call(textgrid, "Extract tier", 1)
        silencetable = call(silencetier, "Down to TableOfReal", "sounding")
        npauses = call(silencetable, "Get number of rows")
        speakingtot = 0
        for row in range(1, npauses + 1):  # Praat rows are 1-based
            beginsound = call(silencetable, "Get value", row, 1)
            endsound = call(silencetable, "Get value", row, 2)
            speakingtot += endsound - beginsound
        total_pause_duration = originaldur - speakingtot

        # Treat the intensity contour as a sound so its extrema can be located.
        intensity_matrix = call(intensity, "Down to Matrix")
        sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)
        intensity_duration = call(sound_from_intensity_matrix, "Get total duration")
        point_process = call(
            sound_from_intensity_matrix, "To PointProcess (extrema)",
            "Left", "yes", "no", "Sinc70",
        )
        numpeaks = call(point_process, "Get number of points")

        # Keep only the peaks whose intensity lies above the threshold.
        timepeaks = []
        intensities = []
        for index in range(1, numpeaks + 1):
            peak_time = call(point_process, "Get time from index", index)
            value = call(sound_from_intensity_matrix, "Get value at time", peak_time, "Cubic")
            if value > threshold:
                timepeaks.append(peak_time)
                intensities.append(value)

        # A peak is valid when the dip between it and the next peak is deep enough.
        # The final peak has no successor and is therefore never validated.
        validtime = []
        if timepeaks:  # guard: no peaks above threshold means no syllables
            currentint = intensities[0]
            for peak_time, next_time in zip(timepeaks, timepeaks[1:]):
                dip = call(intensity, "Get minimum", peak_time, next_time, "None")
                if abs(currentint - dip) > mindip:
                    validtime.append(peak_time)
                currentint = call(intensity, "Get value at time", next_time, "Cubic")

        # Count valid peaks that are voiced (defined pitch) and inside a sounding interval.
        voicedcount = 0
        if validtime:
            pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
            for querytime in validtime:
                whichinterval = call(textgrid, "Get interval at time", 1, querytime)
                whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
                value = pitch.get_value_at_time(querytime)
                if not math.isnan(value) and whichlabel == "sounding":
                    voicedcount += 1

        nan = float('nan')
        # Guard every ratio against a zero denominator.
        speakingrate = voicedcount / originaldur if originaldur != 0 else nan
        articulationrate = voicedcount / speakingtot if speakingtot != 0 else nan
        asd = speakingtot / voicedcount if voicedcount != 0 else nan
        # Pauses are counted as the gaps between sounding intervals; clamp so that
        # a signal without any sounding interval reports 0 rather than -1.
        npause = max(npauses - 1, 0)
        return (
            voicedcount,
            npause,
            originaldur,
            intensity_duration,
            speakingrate,
            articulationrate,
            asd,
            total_pause_duration,
        )