"""Compute Voice Clarity Score from an audio file."""

from typing import Any, Dict

import librosa
import numpy as np

from .vcs import calculate_voice_clarity_score, get_clarity_insight


def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """Compute Voice Clarity Score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model (e.g., OpenAI Whisper or
            faster-whisper); must expose a ``transcribe(path)`` method.

    Returns:
        dict: Voice Clarity Score, component scores, and the transcript.

    Raises:
        ValueError: If the transcription is empty or the audio has no duration.
    """
    # Transcribe first so we can fail fast on unusable audio.
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # sr=None keeps the native sampling rate (no resampling).
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Core clarity computation is delegated to the .vcs module.
    clarity_result = calculate_voice_clarity_score(y, sr, segments)
    clarity_result["transcript"] = transcript

    # Attach reference metadata alongside the component scores.
    clarity_result["components"]["word_count"] = len(transcript.split())
    clarity_result["components"]["duration"] = duration

    return clarity_result


def _pitch_stability_score(y: np.ndarray, sr: int) -> float:
    """Score (0-100) pitch steadiness via the F0 coefficient of variation."""
    f0, _voiced_flags, _voiced_probs = librosa.pyin(
        y,
        sr=sr,
        fmin=80,
        fmax=400,
        frame_length=1024,
        hop_length=256,
        fill_na=np.nan,
    )
    voiced_f0 = f0[~np.isnan(f0)]
    if voiced_f0.size == 0:
        # No voiced frames detected: report zero stability.
        return 0.0
    mean_f0 = np.mean(voiced_f0)
    # Coefficient of variation: lower means a steadier pitch.
    cv = np.std(voiced_f0) / mean_f0 if mean_f0 > 0 else float("inf")
    # Map CV onto a 0-100 score (CV of 0 -> 100; CV >= 1 -> 0).
    return float(max(0, min(100, 100 - (cv * 100))))


def _resonance_score(y: np.ndarray, sr: int) -> float:
    """Score (0-100) voice resonance from mean spectral bandwidth."""
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    # Ideal range for speech is roughly 1500-2500 Hz.
    if bandwidth < 1000:
        score = max(0, bandwidth / 1000 * 70)  # too narrow
    elif bandwidth <= 2500:
        score = 70 + ((bandwidth - 1000) / 1500 * 30)  # optimal range
    else:
        score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # too wide
    return float(score)


def _strength_score(y: np.ndarray) -> float:
    """Score (0-100) voice strength from mean RMS energy."""
    rms = np.mean(librosa.feature.rms(y=y))
    # Typical speech RMS values range from about 0.01 to 0.2.
    return float(min(100, max(0, rms / 0.2 * 100)))


def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
    """Comprehensive voice quality analysis including clarity.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model.

    Returns:
        Dict[str, Any]: Complete voice quality analysis — VCS, insight,
        merged component scores, and the transcript.

    Raises:
        ValueError: Propagated from ``compute_voice_clarity_score``.
    """
    clarity_results = compute_voice_clarity_score(file_path, whisper_model)

    # Reload the waveform for the additional metrics.
    # NOTE(review): the file is decoded twice (once inside
    # compute_voice_clarity_score) — acceptable for short clips, but a
    # candidate for sharing the buffer if this becomes a hot path.
    y, sr = librosa.load(file_path, sr=None)

    additional_metrics = {
        "pitch_stability": _pitch_stability_score(y, sr),
        "voice_resonance": _resonance_score(y, sr),
        "voice_strength": _strength_score(y),
    }

    return {
        "VCS": clarity_results["VCS"],
        "insight": clarity_results["insight"],
        # Clarity components merged with the additional metrics.
        "components": {**clarity_results["components"], **additional_metrics},
        "transcript": clarity_results["transcript"],
    }


# Ensure the functions are exposed when imported
__all__ = ['compute_voice_clarity_score',
           'analyze_voice_quality']