"""
Compute Voice Clarity Score from audio file
"""
import librosa
import numpy as np
from typing import Dict, Any
from .vcs import calculate_voice_clarity_score, get_clarity_insight


def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute the Voice Clarity Score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Loaded transcription model whose ``transcribe()`` returns a
            dict with "text" and "segments" keys (e.g., OpenAI Whisper).

    Returns:
        dict: A dictionary containing the Voice Clarity Score and component scores.
    """
    # Transcribe audio
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Validate early
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Load audio
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Calculate Voice Clarity Score
    clarity_result = calculate_voice_clarity_score(y, sr, segments)
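    # The result is expected to be a dict with at least "VCS", "insight", and
    # "components" keys (these are consumed by analyze_voice_quality below).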

    # Add transcript to results
    clarity_result["transcript"] = transcript

    # Add word count and duration info for reference
    word_count = len(transcript.split())
    clarity_result["components"]["word_count"] = word_count
    clarity_result["components"]["duration"] = duration

    return clarity_result


def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Comprehensive voice quality analysis including clarity.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model (see compute_voice_clarity_score).

    Returns:
        Dict[str, Any]: Complete voice quality analysis.
    """
    # Get Voice Clarity Score
    clarity_results = compute_voice_clarity_score(file_path, whisper_model)
    vcs = clarity_results["VCS"]

    # Load audio for additional analysis
    y, sr = librosa.load(file_path, sr=None)

    # Calculate additional voice quality metrics
    # Voice stability - based on pitch (F0) stability
    f0, voiced_flags, voiced_probs = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan
    )
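    # pyin returns a per-frame F0 track in which unvoiced frames are NaN
    # (because fill_na=np.nan), so those frames are dropped before computing
    # the stability statistics.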
    voiced_f0 = f0[~np.isnan(f0)]
    pitch_stability = 0.0
    if voiced_f0.size > 0:
        # Calculate coefficient of variation (lower is more stable)
        cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
        # Convert to score (0-100)
        pitch_stability = max(0, min(100, 100 - (cv * 100)))

    # Voice resonance - based on spectral bandwidth
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    # Map to a 0-100 score; the scoring below treats roughly 1000-2500 Hz
    # as the optimal bandwidth range for speech
    if bandwidth < 1000:
        resonance_score = max(0, bandwidth / 1000 * 70)  # Too narrow
    elif bandwidth <= 2500:
        resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)  # Optimal range
    else:
        resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # Too wide

    # Voice strength - based on RMS energy
    rms = np.mean(librosa.feature.rms(y=y))
    # Normalize (typical speech RMS values range from 0.01 to 0.2)
    strength_score = min(100, max(0, rms / 0.2 * 100))

    # Combine additional metrics
    additional_metrics = {
        "pitch_stability": pitch_stability,
        "voice_resonance": resonance_score,
        "voice_strength": strength_score,
    }

    # Add to results
    combined_results = {
        "VCS": vcs,
        "insight": clarity_results["insight"],
        "components": {
            **clarity_results["components"],
            **additional_metrics,
        },
        "transcript": clarity_results["transcript"],
    }
    return combined_results


# Ensure the functions are exposed when imported
__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
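

# Minimal usage sketch (not part of the original module): assumes the
# `openai-whisper` package, whose `transcribe()` returns the dict this module
# expects; the model name and audio path below are placeholders. Because this
# file uses a relative import, run it via the package (e.g. `python -m ...`)
# rather than as a standalone script.
if __name__ == "__main__":
    import json
    import whisper  # pip install openai-whisper

    model = whisper.load_model("base")  # hypothetical model choice
    report = analyze_voice_quality("sample.wav", model)  # hypothetical audio path

    print(f"Voice Clarity Score: {report['VCS']:.1f}")
    print(f"Insight: {report['insight']}")
    print(json.dumps(report["components"], indent=2, default=float))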