Fast_api / fluency /compute_fluency.py
mulasagg's picture
Add application file
8ad2ab3
raw
history blame
3.5 kB
"""
Compute fluency score from audio file using SRS and PAS calculations
"""
import librosa
import numpy as np
from typing import Dict, Any, Union
from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
from .filler_analyzer import detect_fillers
def _pitch_variation_semitones(y: np.ndarray, sr) -> float:
    """
    Return the standard deviation of voiced pitch, in semitones around the median.

    Args:
        y: Audio samples as returned by ``librosa.load``.
        sr: Sample rate of ``y``.

    Returns:
        float: Spread of the voiced f0 track in semitones relative to its
        median, or 0.0 when pYIN reports no voiced frame.
    """
    # Only the f0 track is needed; the voiced flags/probabilities are discarded.
    f0, _, _ = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)

    # Frames without a pitch estimate are NaN (fill_na=np.nan); drop them.
    voiced_f0 = f0[~np.isnan(f0)]
    if voiced_f0.size == 0:
        return 0.0

    # Clamp the reference pitch so the ratio below can never divide by zero.
    median_f0 = max(np.nanmedian(voiced_f0), 1e-6)
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    return float(np.nanstd(semitone_diffs))


def _count_long_pauses(segments, duration: float, threshold: float = 1.0) -> int:
    """
    Count silences strictly longer than ``threshold``.

    Three kinds of gaps are considered: the gap between each pair of
    consecutive segments, the lead-in before the first segment, and the
    tail between the last segment's end and ``duration``.

    Args:
        segments: Non-empty sequence of mappings with "start" and "end" keys.
        duration: Total audio length, in the same unit as the segment timestamps.
        threshold: Minimum gap length (exclusive) for a pause to count.

    Returns:
        int: Number of gaps longer than ``threshold``.
    """
    gaps = [
        following["start"] - current["end"]
        for current, following in zip(segments, segments[1:])
    ]
    gaps.append(segments[0]["start"])            # silence before speech starts
    gaps.append(duration - segments[-1]["end"])  # silence after speech ends
    return sum(1 for gap in gaps if gap > threshold)


def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute fluency score and its components from a speech sample.

    The audio is transcribed, then filler words, pitch variation, long pauses
    and words-per-minute are measured and fed to the SRS / PAS calculators,
    whose outputs are combined into the final fluency score.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model. Its ``transcribe(file_path)`` must
            return a mapping with a "text" string and a "segments" list whose
            items expose "start" and "end" timestamps (compared here against
            the audio duration in seconds).

    Returns:
        dict: ``fluency_score``, ``insight``, ``SRS``, ``PAS``, the
        ``transcript``, and a ``components`` dict holding ``wpm``,
        ``filler_count``, ``long_pause_count``, ``pitch_variation``,
        ``word_count``, ``duration`` and ``pas_components`` (the full result
        returned by ``calculate_pas``).

    Raises:
        ValueError: If the transcription yields no text or no segments, or if
            the loaded audio has zero duration.
    """
    # Transcribe audio
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    # Validate early: everything below assumes at least one segment and some text.
    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    # Detect filler words (only the count is used here)
    filler_count, _ = detect_fillers(transcript)

    # Load audio at its native sample rate
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    # Prosody and pause features
    pitch_variation = _pitch_variation_semitones(y, sr)
    long_pause_count = _count_long_pauses(segments, duration)

    # Words per minute; duration is guaranteed positive by the check above.
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0

    # Calculate SRS - Speech Rate Stability
    srs_score = calc_srs(
        wpm=words_per_min,
        filler_count=filler_count,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation
    )

    # Calculate PAS - Pause Appropriateness Score
    pas_result = calculate_pas(
        transcript=transcript,
        segments=segments,
        filler_count=filler_count,
        duration=duration
    )
    pas_score = pas_result["PAS"]

    # Calculate final fluency score
    fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
    fluency_score = fluency_result["score"]
    insight = get_fluency_insight(fluency_score)

    # Build and return comprehensive result
    return {
        "fluency_score": fluency_score,
        "insight": insight,
        "SRS": srs_score,
        "PAS": pas_score,
        "components": {
            "wpm": words_per_min,
            "filler_count": filler_count,
            "long_pause_count": long_pause_count,
            "pitch_variation": pitch_variation,
            "word_count": word_count,
            "duration": duration,
            "pas_components": pas_result
        },
        "transcript": transcript
    }