import math

import librosa
import numpy as np

from .filler_analyzer import detect_fillers
from .find_valence import get_valence_score
from .vers import calc_vers


def _amplitude_to_db(amplitude: float) -> float:
    """Convert a linear amplitude to decibels; non-positive input maps to -80.0."""
    # The 1e-6 offset mirrors the floor used for the per-frame dB values.
    return 20 * math.log10(amplitude + 1e-6) if amplitude > 0 else -80.0


def _pitch_variation_semitones(y, sr) -> float:
    """Return the std-dev of voiced f0 around its median, in semitones.

    Returns 0.0 when pYIN reports no voiced frames.
    """
    # Only the f0 track is needed; the voiced flags/probabilities are ignored.
    f0, _, _ = librosa.pyin(
        y,
        sr=sr,
        fmin=80,
        fmax=400,
        frame_length=1024,
        hop_length=256,
        fill_na=np.nan,
    )
    voiced_f0 = f0[~np.isnan(f0)]
    if voiced_f0.size == 0:
        return 0.0

    # Clamp the reference pitch so the ratio below can never divide by zero.
    median_f0 = max(np.nanmedian(voiced_f0), 1e-6)
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    return float(np.nanstd(semitone_diffs))


def _count_long_pauses(segments, duration: float, threshold: float = 1.0) -> int:
    """Count silences longer than *threshold* between, before and after segments.

    Each segment must provide "start" and "end" values on the same time axis
    as *duration*.
    """
    if not segments:
        return 0

    # Gaps between consecutive segments.
    count = sum(
        1
        for current, following in zip(segments, segments[1:])
        if following["start"] - current["end"] > threshold
    )
    # Silence before the first segment and after the last one.
    if segments[0]["start"] > threshold:
        count += 1
    if duration - segments[-1]["end"] > threshold:
        count += 1
    return count


def compute_vers_score(file_path: str, whisper_model) -> dict:
    """Compute VERS (Vocal Emotional Regulation Score) from a speech sample.

    The audio file is transcribed with ``whisper_model`` and analysed for
    filler words, loudness, pitch variation, long pauses and speaking rate;
    those features plus the valence scores are handed to ``calc_vers``.

    Args:
        file_path: Path to the audio file to analyse.
        whisper_model: Object exposing ``transcribe(path)`` that returns a
            mapping with optional ``"text"`` and ``"segments"`` entries, where
            each segment provides ``"start"`` and ``"end"``.

    Returns:
        The mapping produced by ``calc_vers`` with the stripped transcript
        stored under the ``"transcript"`` key.
    """
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Filler count
    filler_count, _ = detect_fillers(transcript)

    # Load audio at its native sampling rate
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0

    # Volume (RMS)
    rms = librosa.feature.rms(y=y)[0]
    mean_volume_db = _amplitude_to_db(float(np.mean(rms)))
    # Cast to a built-in float so every feature handed on is a plain float,
    # matching the other scalar features.
    volume_std = float(np.std(20 * np.log10(rms + 1e-6)))

    # Max volume (peak absolute sample)
    vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
    vol_max_db = _amplitude_to_db(vol_max)

    # Pitch variation
    pitch_variation = _pitch_variation_semitones(y, sr)

    # Pause analysis
    long_pause_count = _count_long_pauses(segments, duration)

    # WPM, measured against the full audio duration
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0

    valence_scores = get_valence_score(file_path)

    # Calculate VERS
    vers_result = calc_vers(
        filler_count=filler_count,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation,
        mean_volume_db=mean_volume_db,
        vol_max_db=vol_max_db,
        wpm=words_per_min,
        volume_std=volume_std,
        valence_scores=valence_scores,
    )

    # Include transcript optionally
    vers_result["transcript"] = transcript
    return vers_result