"""Compute the Voice Clarity Score (VCS) from an audio file."""

import librosa
import numpy as np
from typing import Dict, Any

from .vcs import calculate_voice_clarity_score, get_clarity_insight


def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute the Voice Clarity Score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper).

    Returns:
        dict: The Voice Clarity Score, its component scores, and the transcript.
    """
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")
    clarity_result = calculate_voice_clarity_score(y, sr, segments)
    clarity_result["transcript"] = transcript
    word_count = len(transcript.split())
    clarity_result["components"]["word_count"] = word_count
    clarity_result["components"]["duration"] = duration

    return clarity_result


def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Comprehensive voice quality analysis, including clarity.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model.

    Returns:
        Dict[str, Any]: Complete voice quality analysis.
    """
    clarity_results = compute_voice_clarity_score(file_path, whisper_model)
    vcs = clarity_results["VCS"]

    y, sr = librosa.load(file_path, sr=None)
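
    # Pitch stability: track F0 with pYIN over a typical speech range
    # (80-400 Hz) and score the relative spread of the voiced frames.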
    f0, voiced_flags, voiced_probs = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400,
        frame_length=1024, hop_length=256, fill_na=np.nan
    )
    voiced_f0 = f0[~np.isnan(f0)]

    pitch_stability = 0.0
    if voiced_f0.size > 0:
        # Coefficient of variation: lower relative spread means steadier pitch.
        cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
        pitch_stability = max(0, min(100, 100 - (cv * 100)))
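
    # Voice resonance: piecewise score of the mean spectral bandwidth,
    # highest for bandwidths near 2500 Hz.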
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))

    if bandwidth < 1000:
        resonance_score = max(0, bandwidth / 1000 * 70)
    elif bandwidth <= 2500:
        resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)
    else:
        resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))
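
    # Voice strength: mean RMS energy, scaled so an RMS of 0.2 maps to 100.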
    rms = np.mean(librosa.feature.rms(y=y))
    strength_score = min(100, max(0, rms / 0.2 * 100))
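
    # Bundle the auxiliary metrics for merging into the clarity components.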
    additional_metrics = {
        "pitch_stability": pitch_stability,
        "voice_resonance": resonance_score,
        "voice_strength": strength_score,
    }
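
    # Merge the clarity results with the additional voice metrics.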
    combined_results = {
        "VCS": vcs,
        "insight": clarity_results["insight"],
        "components": {
            **clarity_results["components"],
            **additional_metrics,
        },
        "transcript": clarity_results["transcript"],
    }

    return combined_results


__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
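

# Minimal usage sketch (assumption: the openai-whisper package is installed
# and "sample.wav" exists; any object whose .transcribe(path) returns a dict
# with "text" and "segments" works the same way). Because this module uses a
# relative import, run it as a module: `python -m <package>.<this_module>`.
if __name__ == "__main__":
    import whisper

    model = whisper.load_model("base")
    analysis = analyze_voice_quality("sample.wav", model)
    print("VCS:", analysis["VCS"])
    print("Insight:", analysis["insight"])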