"""
Compute Voice Clarity Score from audio file
"""
import librosa
import numpy as np
from typing import Dict, Any
from .vcs import calculate_voice_clarity_score, get_clarity_insight


def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute the Voice Clarity Score and its components from a speech sample.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Loaded transcription model whose ``transcribe()`` returns a
            dict with "text" and "segments" keys (e.g., OpenAI Whisper).

    Returns:
        dict: A dictionary containing the Voice Clarity Score and component scores.
    """
    # Transcribe audio
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Validate early
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Load audio
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Calculate Voice Clarity Score
    clarity_result = calculate_voice_clarity_score(y, sr, segments)
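    # The result is expected to be a dict with at least "VCS", "insight", and
    # "components" keys (these are consumed by analyze_voice_quality below).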

    # Add transcript to results
    clarity_result["transcript"] = transcript

    # Add word count and duration info for reference
    word_count = len(transcript.split())
    clarity_result["components"]["word_count"] = word_count
    clarity_result["components"]["duration"] = duration

    return clarity_result


def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Comprehensive voice quality analysis including clarity.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model (see compute_voice_clarity_score).

    Returns:
        Dict[str, Any]: Complete voice quality analysis.
    """
    # Get Voice Clarity Score
    clarity_results = compute_voice_clarity_score(file_path, whisper_model)
    vcs = clarity_results["VCS"]

    # Load audio for additional analysis
    y, sr = librosa.load(file_path, sr=None)

    # Calculate additional voice quality metrics
    # Voice stability - based on pitch (F0) stability
    f0, voiced_flags, voiced_probs = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan
    )
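    # pyin returns a per-frame F0 track in which unvoiced frames are NaN
    # (because fill_na=np.nan), so those frames are dropped before computing
    # the stability statistics.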
    voiced_f0 = f0[~np.isnan(f0)]
    pitch_stability = 0.0
    if voiced_f0.size > 0:
        # Calculate coefficient of variation (lower is more stable)
        cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
        # Convert to score (0-100)
        pitch_stability = max(0, min(100, 100 - (cv * 100)))

    # Voice resonance - based on spectral bandwidth
    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    # Map to a 0-100 score; the scoring below treats roughly 1000-2500 Hz
    # as the optimal bandwidth range for speech
    if bandwidth < 1000:
        resonance_score = max(0, bandwidth / 1000 * 70)  # Too narrow
    elif bandwidth <= 2500:
        resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)  # Optimal range
    else:
        resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # Too wide

    # Voice strength - based on RMS energy
    rms = np.mean(librosa.feature.rms(y=y))
    # Normalize (typical speech RMS values range from 0.01 to 0.2)
    strength_score = min(100, max(0, rms / 0.2 * 100))

    # Combine additional metrics
    additional_metrics = {
        "pitch_stability": pitch_stability,
        "voice_resonance": resonance_score,
        "voice_strength": strength_score,
    }

    # Add to results
    combined_results = {
        "VCS": vcs,
        "insight": clarity_results["insight"],
        "components": {
            **clarity_results["components"],
            **additional_metrics,
        },
        "transcript": clarity_results["transcript"],
    }
    return combined_results


# Ensure the functions are exposed when imported
__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality']
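

# Minimal usage sketch (not part of the original module): assumes the
# `openai-whisper` package, whose `transcribe()` returns the dict this module
# expects; the model name and audio path below are placeholders. Because this
# file uses a relative import, run it via the package (e.g. `python -m ...`)
# rather than as a standalone script.
if __name__ == "__main__":
    import json
    import whisper  # pip install openai-whisper

    model = whisper.load_model("base")  # hypothetical model choice
    report = analyze_voice_quality("sample.wav", model)  # hypothetical audio path

    print(f"Voice Clarity Score: {report['VCS']:.1f}")
    print(f"Insight: {report['insight']}")
    print(json.dumps(report["components"], indent=2, default=float))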