# Fast_api/vcs/compute_vcs.py
"""
Compute Voice Clarity Score from audio file
"""
import librosa
import numpy as np
from typing import Dict, Any
from .vcs import calculate_voice_clarity_score, get_clarity_insight


def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute Voice Clarity Score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper).

    Returns:
        dict: A dictionary containing the Voice Clarity Score and component scores.
    """
    # Transcribe audio
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Validate early
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Load audio
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Calculate Voice Clarity Score
    clarity_result = calculate_voice_clarity_score(y, sr, segments)

    # Add transcript to results
    clarity_result["transcript"] = transcript

    # Add word count and duration info for reference
    word_count = len(transcript.split())
    clarity_result["components"]["word_count"] = word_count
    clarity_result["components"]["duration"] = duration

    return clarity_result


def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Comprehensive voice quality analysis including clarity.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model.

    Returns:
        Dict[str, Any]: Complete voice quality analysis.
    """
    # Get Voice Clarity Score
    clarity_results = compute_voice_clarity_score(file_path, whisper_model)
    vcs = clarity_results["VCS"]

    # Load audio for additional analysis
    y, sr = librosa.load(file_path, sr=None)

    # Calculate additional voice quality metrics

    # Voice stability - based on pitch (F0) stability
    f0, voiced_flags, voiced_probs = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan
    )
    voiced_f0 = f0[~np.isnan(f0)]
    pitch_stability = 0.0
    if voiced_f0.size > 0:
        # Calculate coefficient of variation (lower is more stable)
        cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
        # Convert to score (0-100)
        pitch_stability = max(0, min(100, 100 - (cv * 100)))

    # Voice resonance - based on spectral bandwidth
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    # Normalize (ideal range is around 1500-2500 Hz for speech)
    if bandwidth < 1000:
        resonance_score = max(0, bandwidth / 1000 * 70)  # Too narrow
    elif bandwidth <= 2500:
        resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)  # Optimal range
    else:
        resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # Too wide

    # Voice strength - based on RMS energy
    rms = np.mean(librosa.feature.rms(y=y))
    # Normalize (typical speech RMS values range from 0.01 to 0.2)
    strength_score = min(100, max(0, rms / 0.2 * 100))

    # Combine additional metrics
    additional_metrics = {
        "pitch_stability": pitch_stability,
        "voice_resonance": resonance_score,
        "voice_strength": strength_score
    }

    # Add to results
    combined_results = {
        "VCS": vcs,
        "insight": clarity_results["insight"],
        "components": {
            **clarity_results["components"],
            **additional_metrics
        },
        "transcript": clarity_results["transcript"]
    }

    return combined_results


# Ensure the functions are exposed when imported
__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
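

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the API). It assumes the
# openai-whisper package is installed; "sample.wav" is a hypothetical file path
# and the "base" model size is an arbitrary choice.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import whisper  # assumption: openai-whisper is available in this environment

    model = whisper.load_model("base")
    report = analyze_voice_quality("sample.wav", model)  # replace with a real audio file

    print("Voice Clarity Score:", report["VCS"])
    print("Insight:", report["insight"])
    for name, value in report["components"].items():
        print(f"  {name}: {value}")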