# Top-of-file imports. The original line fused four import statements into
# one ("import scipy.signal import numpy as np ..."), which is a SyntaxError;
# they are split onto one line each per PEP 8. All original imports are kept.
import scipy.signal
import numpy as np
import librosa
import pyworld as pw

# NOTE(review): several hundred lines of commented-out legacy drafts lived
# here (pw.dio-based and librosa.pyin-based versions of
# compute_pitch_variation, compute_intonation_range,
# compute_speech_rhythm_variability, calc_sds, and a hard-coded script tail).
# They duplicated the live implementations below and were removed; recover
# them from version control if ever needed.
import numpy as np
import librosa
import pyworld


def compute_pitch_variation(file_path):
    """Compute pitch (F0) variation statistics for an audio file.

    F0 is extracted with pyworld's Harvest algorithm and refined with
    StoneMask, unvoiced frames are discarded, outliers outside the
    5th-95th percentile band are removed, and variation is summarized
    both in Hz and in semitones relative to the median F0.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        dict with keys "pitch_mean", "pitch_std", "pitch_range" (Hz),
        "semitone_std", and "pitch_variation_score" (0-100). All zeros
        when no usable voiced frames are found.
    """
    # Step 1: Load audio at its native sample rate.
    y, sr = librosa.load(file_path, sr=None)

    # Step 2: Extract pitch. pyworld expects float64; frame_period is in
    # milliseconds (a 256-sample hop at the file's sample rate).
    y64 = y.astype(np.float64)
    _f0, t = pyworld.harvest(
        y64, sr, f0_floor=80.0, f0_ceil=400.0,
        frame_period=1000 * 256 / sr,
    )
    f0 = pyworld.stonemask(y64, _f0, t, sr)

    # Step 3: Keep voiced frames only (pyworld reports unvoiced frames as 0).
    voiced_f0 = f0[f0 > 0]

    empty_result = {
        "pitch_mean": 0.0,
        "pitch_std": 0.0,
        "pitch_range": 0.0,
        "semitone_std": 0.0,
        "pitch_variation_score": 0.0,
    }
    # Guard BEFORE the percentile filter: np.percentile raises on an empty
    # array, so an all-unvoiced recording must bail out here (the original
    # code filtered first and would crash on silence).
    if voiced_f0.size == 0:
        return empty_result

    # Remove outliers: keep strictly inside the 5th-95th percentile band.
    voiced_f0 = voiced_f0[
        (voiced_f0 > np.percentile(voiced_f0, 5))
        & (voiced_f0 < np.percentile(voiced_f0, 95))
    ]
    # The strict inequalities can empty the array (e.g. constant pitch).
    if voiced_f0.size == 0:
        return empty_result

    # Step 4: Basic statistics in Hz.
    pitch_mean = float(np.mean(voiced_f0))
    pitch_std = float(np.std(voiced_f0))
    pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0))

    # Step 5: Semitone-based variation relative to the median F0
    # (perceptually more meaningful than raw Hz).
    median_f0 = np.median(voiced_f0)
    if median_f0 <= 0:
        median_f0 = 1e-6  # avoid division by zero / log2 of 0
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    semitone_std = float(np.std(semitone_diffs))

    # Step 6: Scale to a 0-100 score; a semitone std of >= 6 maps to 100.
    pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100))

    return {
        "pitch_mean": pitch_mean,
        "pitch_std": pitch_std,
        "pitch_range": pitch_range,
        "semitone_std": semitone_std,
        "pitch_variation_score": pitch_variation_score,
    }


def compute_intonation_range(file_path):
    """Compute the intonation (F0) range of an audio file in semitones.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        (score, intonation_range): score is the range normalized against a
        one-octave ceiling and scaled to 0-100 (rounded to 2 decimals);
        intonation_range is the raw range in semitones. (0.0, 0.0) when no
        usable voiced frames are found.
    """
    # Step 1: Load audio at its native sample rate.
    y, sr = librosa.load(file_path, sr=None)

    # Step 2: Extract pitch with Harvest + StoneMask (same settings as
    # compute_pitch_variation for consistency).
    y64 = y.astype(np.float64)
    _f0, t = pyworld.harvest(
        y64, sr, f0_floor=80.0, f0_ceil=400.0,
        frame_period=1000 * 256 / sr,
    )
    f0 = pyworld.stonemask(y64, _f0, t, sr)

    # Step 3: Keep voiced frames only.
    voiced_f0 = f0[f0 > 0]
    if voiced_f0.size == 0:
        return 0.0, 0.0

    # Remove outliers (5th-95th percentile, strict bounds).
    voiced_f0 = voiced_f0[
        (voiced_f0 > np.percentile(voiced_f0, 5))
        & (voiced_f0 < np.percentile(voiced_f0, 95))
    ]
    if voiced_f0.size == 0:
        return 0.0, 0.0

    # Step 4: Range in semitones between the lowest and highest voiced F0.
    f0_min = np.min(voiced_f0)
    f0_max = np.max(voiced_f0)
    if f0_min <= 0:
        f0_min = 1e-6  # avoid log2 of a non-positive value
    intonation_range = 12 * np.log2(f0_max / f0_min)

    # Step 5: Normalize against a ~1-octave ceiling and scale to 0-100.
    max_range = 12.0
    normalized = min(intonation_range, max_range) / max_range
    score = normalized * 100
    return round(score, 2), float(intonation_range)


def compute_speech_rhythm_variability(file_path):
    """Compute a speech-rhythm consistency score from onset timing.

    Tempo consistency is estimated from the spread of inter-onset
    intervals (IOIs): a tighter spread means steadier pacing and a
    higher score.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        (score, raw_std): score is 0-100 (higher = more consistent rhythm,
        rounded to 2 decimals); raw_std is the IOI standard deviation in
        seconds (rounded to 4 decimals). (0.0, 0.0) when there are too few
        onsets to measure.
    """
    # Step 1: Load audio at its native sample rate.
    y, sr = librosa.load(file_path, sr=None)

    # Step 2: Onset detection (times in seconds).
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    onsets = librosa.onset.onset_detect(
        onset_envelope=onset_env, sr=sr, units='time'
    )
    if len(onsets) < 2:
        return 0.0, 0.0  # not enough onsets to compute rhythm

    # Step 3: Inter-onset intervals as the rhythm proxy; drop outliers
    # (5th-95th percentile, strict bounds).
    iois = np.diff(onsets)
    ioi_clean = iois[
        (iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))
    ]
    if len(ioi_clean) < 2:
        return 0.0, 0.0

    # Step 4: Variability = standard deviation of the cleaned IOIs.
    raw_std = np.std(ioi_clean)

    # Step 5: Reverse-score into 0-100: lower std (steadier rhythm) scores
    # higher. Bounds are tunable: 0.05 s ~ near-perfect pacing, 0.6 s ~
    # highly irregular.
    min_std = 0.05
    max_std = 0.6
    clamped_std = np.clip(raw_std, min_std, max_std)
    normalized = 1 - (clamped_std - min_std) / (max_std - min_std)
    score = normalized * 100
    return round(score, 2), round(float(raw_std), 4)


def calc_sds(file_path):
    """Compute the Speech Dynamism Score (SDS) for an audio file.

    SDS is a fixed-weight blend of three sub-scores, each on a 0-100
    scale: 35% pitch variation + 35% intonation range + 30% speech
    rhythm variability.

    Args:
        file_path: Path to an audio file readable by librosa.

    Returns:
        The SDS as a float rounded to 2 decimals.
    """
    pitch_variation = compute_pitch_variation(file_path)
    intonation_range = compute_intonation_range(file_path)
    speech_rhythm_variability = compute_speech_rhythm_variability(file_path)

    sds = (
        0.35 * pitch_variation['pitch_variation_score']
        + 0.35 * intonation_range[0]
        + 0.3 * speech_rhythm_variability[0]
    )
    return round(sds, 2)