Spaces:

mulasagg
/

Voice

Sleeping

App Files Files Community

Voice / vps /compute_vps_score.py

mulasagg

first

8031a8f 3 months ago

raw

history blame contribute delete

2.6 kB

	from .vps import calculate_vps # Your file where calc_srs, calculate_pas, calculate_rcs, calculate_vps live
	import librosa
	import numpy as np
	import math
	from .filler_analyzer import detect_fillers

	def compute_vps_score(file_path: str, whisper_model) -> dict:
	    """Transcribe a speech sample and compute its VPS (Voice Pacing Score).

	    The audio is transcribed with ``whisper_model`` and then decoded with
	    librosa. Duration, words per minute, long-pause count, pitch variation
	    and the filler count are passed to ``calculate_vps`` together with the
	    raw signal.

	    Args:
	        file_path: Path to the audio file.
	        whisper_model: Object whose ``transcribe(file_path)`` returns a
	            mapping with a ``"text"`` string and a ``"segments"`` sequence
	            of mappings that carry ``"start"`` and ``"end"`` (compared
	            against the audio duration, i.e. treated as seconds).
	            NOTE(review): faster-whisper's ``transcribe`` appears to return
	            a ``(segments, info)`` tuple rather than such a mapping;
	            confirm/adapt before passing one.

	    Returns:
	        The dict returned by ``calculate_vps`` with the stripped transcript
	        added under ``"transcript"``. Presumably it holds VPS, SRS, PAS,
	        RCS and their component scores; verify against ``calculate_vps``.

	    Raises:
	        ValueError: If the transcript or the segment list is empty, or if
	            the decoded audio has a non-positive duration.
	    """
	    result = whisper_model.transcribe(file_path)
	    transcript = result.get("text", "").strip()
	    segments = result.get("segments", [])

	    # Fail fast: everything below needs both words and segment timings.
	    if not transcript or not segments:
	        raise ValueError("Empty transcript or segments from Whisper.")

	    filler_count, _ = detect_fillers(transcript)

	    # sr=None asks librosa to keep the file's own sampling rate.
	    y, sr = librosa.load(file_path, sr=None)
	    duration = len(y) / sr if sr else 0.0
	    if duration <= 0:
	        raise ValueError("Audio duration invalid or zero.")

	    pitch_variation = _pitch_variation_semitones(y, sr)
	    long_pause_count = _count_long_pauses(segments, duration)

	    # duration is known to be positive here, so the division is safe.
	    word_count = len(transcript.split())
	    words_per_min = (word_count / duration) * 60.0

	    vps_result = calculate_vps(
	        transcript=transcript,
	        segments=segments,
	        filler_count=filler_count,
	        duration=duration,
	        wpm=words_per_min,
	        long_pause_count=long_pause_count,
	        pitch_variation=pitch_variation,
	        y=y,
	        sr=sr,
	    )

	    # Hand the transcript back alongside the scores.
	    vps_result["transcript"] = transcript
	    return vps_result


	def _pitch_variation_semitones(y, sr) -> float:
	    """Return the std-dev of voiced f0, in semitones relative to its median."""
	    f0, _voiced_flags, _voiced_probs = librosa.pyin(
	        y,
	        sr=sr,
	        fmin=80,
	        fmax=400,
	        frame_length=1024,
	        hop_length=256,
	        fill_na=np.nan,
	    )
	    # Frames without a pitch estimate come back as NaN (fill_na); drop them.
	    voiced_f0 = f0[~np.isnan(f0)]
	    if voiced_f0.size == 0:
	        return 0.0

	    # Floor the reference pitch so the ratio below can never divide by zero.
	    median_f0 = max(np.nanmedian(voiced_f0), 1e-6)
	    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
	    return float(np.nanstd(semitone_diffs))


	def _count_long_pauses(segments, duration, threshold: float = 1.0) -> int:
	    """Count silences longer than ``threshold`` before, between and after segments."""
	    # Leading silence, then each gap between neighbours, then trailing silence.
	    gaps = [segments[0]["start"]]
	    gaps.extend(
	        following["start"] - current["end"]
	        for current, following in zip(segments, segments[1:])
	    )
	    gaps.append(duration - segments[-1]["end"])
	    return sum(1 for gap in gaps if gap > threshold)