diff --git a/README.md b/README.md index bf438ef020e1a7f1ef32ea6a30613f865fb271b0..a2425ffff9dcda4b48915a50d3b725f38220471b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ --- -title: Fast Api -emoji: 💻 -colorFrom: blue -colorTo: purple +title: Voice Deploy +emoji: 🏢 +colorFrom: green +colorTo: gray sdk: docker pinned: false license: mit diff --git a/filler_count/__init__.py b/filler_count/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/filler_count/__pycache__/__init__.cpython-312.pyc b/filler_count/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25b7a301e74978c0e6506a75cf1f94e03b9c5a50 Binary files /dev/null and b/filler_count/__pycache__/__init__.cpython-312.pyc differ diff --git a/filler_count/__pycache__/filler_score.cpython-312.pyc b/filler_count/__pycache__/filler_score.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..342c09cc30d9ae79049c3790259eef572597c277 Binary files /dev/null and b/filler_count/__pycache__/filler_score.cpython-312.pyc differ diff --git a/filler_count/filler_score.py b/filler_count/filler_score.py new file mode 100644 index 0000000000000000000000000000000000000000..ae208fcfedb5b453d58f877b0341c44dbd97f6dc --- /dev/null +++ b/filler_count/filler_score.py @@ -0,0 +1,24 @@ +import re +import whisper + +def analyze_fillers(file_path: str, model_size: str = "base") -> dict: + try: + FILLER_WORDS = ["um", "uh", "hmm", "ah", "er", "eh", "like", "you know", "well"] + + model = whisper.load_model(model_size) + result = model.transcribe(file_path, word_timestamps=False, fp16=False) + transcript = result["text"] + + pattern = r"\b(" + "|".join(FILLER_WORDS) + r")\b" + matches = re.findall(pattern, transcript.lower()) + + filler_counts = {filler: matches.count(filler) for filler in FILLER_WORDS} + total_fillers = sum(filler_counts.values()) + + return { + # "transcript": transcript, + "filler_counts": {k: v for k, v in filler_counts.items() if v > 0}, + "total_fillers": total_fillers + } + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") \ No newline at end of file diff --git a/fluency/__init__.py b/fluency/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fb4f112a7786b5d2f56ec5804e2ec32443669823 --- /dev/null +++ b/fluency/__init__.py @@ -0,0 +1,13 @@ +# fluency/__init__.py +from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight +from .filler_analyzer import detect_fillers +from .compute_fluency import compute_fluency_score + +__all__ = [ + 'calc_srs', + 'calculate_pas', + 'calculate_fluency', + 'get_fluency_insight', + 'detect_fillers', + 'compute_fluency_score' +] \ No newline at end of file diff --git a/fluency/__pycache__/__init__.cpython-312.pyc b/fluency/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e6e73030145775aeaad1a0a0f1d524fc00f8d56 Binary files /dev/null and b/fluency/__pycache__/__init__.cpython-312.pyc differ diff --git a/fluency/__pycache__/compute_fluency.cpython-312.pyc b/fluency/__pycache__/compute_fluency.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38ced3a7fca80d7c83c6a4193da1d9cc8a95aae4 Binary files /dev/null and b/fluency/__pycache__/compute_fluency.cpython-312.pyc differ diff --git a/fluency/__pycache__/filler_analyzer.cpython-312.pyc 
b/fluency/__pycache__/filler_analyzer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..916bd6bbf6ef2a033b546e511061165d10eb1ba6 Binary files /dev/null and b/fluency/__pycache__/filler_analyzer.cpython-312.pyc differ diff --git a/fluency/__pycache__/fluency.cpython-312.pyc b/fluency/__pycache__/fluency.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e47449770f2e2428c6e0fd46ddb7ef3ee6cb385 Binary files /dev/null and b/fluency/__pycache__/fluency.cpython-312.pyc differ diff --git a/fluency/__pycache__/fluency_api.cpython-312.pyc b/fluency/__pycache__/fluency_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29d1a2e8c4c9c227d6c725029f9f6f45ef7658b7 Binary files /dev/null and b/fluency/__pycache__/fluency_api.cpython-312.pyc differ diff --git a/fluency/__pycache__/main.cpython-312.pyc b/fluency/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..805180b9205c2030775471214c226de9ca5c8a4b Binary files /dev/null and b/fluency/__pycache__/main.cpython-312.pyc differ diff --git a/fluency/compute_fluency.py b/fluency/compute_fluency.py new file mode 100644 index 0000000000000000000000000000000000000000..52fe6f50986adec73909bfe58e2e0758ed76ffcd --- /dev/null +++ b/fluency/compute_fluency.py @@ -0,0 +1,106 @@ +""" +Compute fluency score from audio file using SRS and PAS calculations +""" + +import librosa +import numpy as np +from typing import Dict, Any, Union +from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight +from .filler_analyzer import detect_fillers + +def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]: + """ + Compute fluency score and its components from a speech sample. + + Args: + file_path (str): Path to the audio file. + whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper) + + Returns: + dict: A dictionary containing fluency score, SRS, PAS, and component scores. 
+ """ + # Transcribe audio + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + # Validate early + if not transcript or not segments: + raise ValueError("Empty transcript or segments from Whisper.") + + # Detect filler words + filler_count, _ = detect_fillers(transcript) + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + if duration <= 0: + raise ValueError("Audio duration invalid or zero.") + + # Calculate pitch variation (in semitones) + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + pitch_variation = 0.0 + if voiced_f0.size > 0: + median_f0 = np.nanmedian(voiced_f0) + median_f0 = max(median_f0, 1e-6) + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + pitch_variation = float(np.nanstd(semitone_diffs)) + + # Analyze pauses + long_pause_count = 0 + if segments: + for i in range(len(segments) - 1): + pause_dur = segments[i + 1]["start"] - segments[i]["end"] + if pause_dur > 1.0: + long_pause_count += 1 + # Check beginning and end pauses + if segments[0]["start"] > 1.0: + long_pause_count += 1 + if duration - segments[-1]["end"] > 1.0: + long_pause_count += 1 + + # Calculate WPM + word_count = len(transcript.split()) + words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0 + + # Calculate SRS - Speech Rate Stability + srs_score = calc_srs( + wpm=words_per_min, + filler_count=filler_count, + long_pause_count=long_pause_count, + pitch_variation=pitch_variation + ) + + # Calculate PAS - Pause Appropriateness Score + pas_result = calculate_pas( + transcript=transcript, + segments=segments, + filler_count=filler_count, + duration=duration + ) + pas_score = pas_result["PAS"] + + # Calculate final fluency score + fluency_result = calculate_fluency(srs=srs_score, pas=pas_score) + fluency_score = fluency_result["score"] + insight = get_fluency_insight(fluency_score) + + # Build and return comprehensive result + return { + "fluency_score": fluency_score, + "insight": insight, + "SRS": srs_score, + "PAS": pas_score, + "components": { + "wpm": words_per_min, + "filler_count": filler_count, + "long_pause_count": long_pause_count, + "pitch_variation": pitch_variation, + "word_count": word_count, + "duration": duration, + "pas_components": pas_result + }, + "transcript": transcript + } \ No newline at end of file diff --git a/fluency/filler_analyzer.py b/fluency/filler_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..090853dcbea679429376dbc44ace524e671baf61 --- /dev/null +++ b/fluency/filler_analyzer.py @@ -0,0 +1,100 @@ +# Define filler words for English, Hindi, Tamil (in both Latin and native scripts) +# Mapping each variant to a common label (usually the Latin script for insight reporting) +FILLER_VARIANTS = { + # English fillers + "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er", + "umm": "um", "uhh": "uh", "mmm": "hmm", + "like": "like", "you know": "you know", "so": "so", "well": "well", + # Hindi fillers (Devanagari and transliteration) + "मतलब": "matlab", "matlab": "matlab", + "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain", + "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na", + "ऐसा है": "aisa hai", "aisa hai": "aisa hai", + "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan" + "अच्छा": "acha", "acha": "acha", + # Tamil fillers 
(Tamil script and transliteration) + "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na", + "அப்பரம்": "apparam", "apparam": "apparam", + "என்ன": "enna", "enna": "enna" +} + +def detect_fillers(transcript): + """ + Detects filler words in the transcript. + + Args: + transcript: Full transcript text + + Returns: + tuple: (filler_count, filler_occurrences) + """ + transcript_lower = transcript.lower() + filler_count = 0 + # Track which specific fillers were used (for insight examples) + filler_occurrences = {} + + for variant, label in FILLER_VARIANTS.items(): + if variant in transcript_lower: + count = transcript_lower.count(variant) + if count > 0: + filler_count += count + # Accumulate count for the normalized label + filler_occurrences[label] = filler_occurrences.get(label, 0) + count + + return filler_count, filler_occurrences + +def analyze_filler_words(filler_count, filler_occurrences, duration): + """ + Analyzes filler word usage in speech. + + Args: + filler_count: Total count of filler words + filler_occurrences: Dictionary of specific filler words and their counts + duration: Duration of the audio in seconds + + Returns: + dict: Contains the filler words score and insight text + """ + # Extract top examples for insights + filler_examples = [] + if filler_occurrences: + # Sort by frequency + sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True) + for label, count in sorted_fillers[:2]: + filler_examples.append(label) + + # Compute fillers per minute as a gauge + filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0 + + if filler_count == 0: + filler_score = 10 + elif filler_per_min < 1: + filler_score = 9 + elif filler_per_min < 3: + filler_score = 8 + elif filler_per_min < 5: + filler_score = 6 + elif filler_per_min < 10: + filler_score = 4 + else: + filler_score = 2 + + filler_score = max(0, filler_score) + + # Generate insight text based on the score and examples + if filler_count == 0: + insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear." + elif filler_count <= 2: + example = filler_examples[0] if filler_examples else "um" + insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact." + elif filler_count <= 5: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words" + insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity." + else: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'" + insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty." + + return { + "score": int(filler_score), + "insight": insight + } \ No newline at end of file diff --git a/fluency/fluency.py b/fluency/fluency.py new file mode 100644 index 0000000000000000000000000000000000000000..c4677068bee6dadf2cc9b7904cd40f4d51489178 --- /dev/null +++ b/fluency/fluency.py @@ -0,0 +1,149 @@ + + +import spacy +from typing import List, Dict + +def calc_srs(wpm, filler_count, long_pause_count, pitch_variation): + """ + Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm. + + Args: + wpm (float): Words per minute + filler_count (int): Number of filler words ("um", "uh", etc.) 
+ long_pause_count (int): Number of pauses longer than 1 second + pitch_variation (float): Standard deviation of pitch in semitones + + Returns: + float: SRS score between 0-100 + + Requires: + - Words per Minute Consistency: Regularity in speech speed. + - Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes. + """ + ideal_wpm = 150 + wpm_deviation = min(30, abs(wpm - ideal_wpm)) # Cap at 30 WPM deviation + wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) # 100-50 for max deviation + + # Sudden Speech Shift Penalty + filler_penalty = min(filler_count / 10, 1.0) + pause_penalty = min(long_pause_count / 5, 1.0) + pitch_penalty = min(pitch_variation / 3.0, 1.0) # High variation → unstable + + # Combine into absence of sudden shifts + stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100 + + # Final SRS Score + SRS = (0.45 * wpm_consistency) + (0.55 * stability) + return min(100, max(0, SRS)) + + +def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]: + """ + Calculate the Pause Appropriateness Score (PAS) and its components. + + Args: + transcript (str): Full transcript text + segments (List[Dict]): List of transcript segments with start/end times + filler_count (int): Number of filler words detected + duration (float): Total duration of audio in seconds + + Returns: + Dict[str, float]: Dictionary with NPP, AFW, and PAS scores + """ + if not transcript or not segments or duration <= 0: + raise ValueError("Transcript, segments, and duration must be valid") + + nlp = spacy.load("en_core_web_sm") + doc = nlp(transcript) + + words = transcript.split() + total_words = len(words) + if total_words == 0: + raise ValueError("No words found in transcript") + + # Calculate Avoidance of Filler Words (AFW) + filler_rate = filler_count / total_words if total_words > 0 else 0.0 + if filler_rate >= 0.10: + afw = 0.0 + elif filler_rate <= 0.0: + afw = 100.0 + else: + afw = 100.0 - (filler_rate * 1000) + afw = max(0.0, min(100.0, afw)) + + # Calculate Natural Pause Placement (NPP) + total_pauses = 0 + natural_pauses = 0 + segment_texts = [seg["text"].strip() for seg in segments] + segment_starts = [seg["start"] for seg in segments] + segment_ends = [seg["end"] for seg in segments] + + for i in range(len(segments) - 1): + pause_dur = segment_starts[i + 1] - segment_ends[i] + if pause_dur > 0.5: + total_pauses += 1 + if segment_texts[i] and segment_texts[i][-1] in ".!?,": + natural_pauses += 1 + + # Check initial and final pauses + if segment_starts[0] > 0.5: + total_pauses += 1 + if duration - segment_ends[-1] > 0.5: + total_pauses += 1 + if segment_texts[-1] and segment_texts[-1][-1] in ".!?": + natural_pauses += 1 + + npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0 + + # Calculate final PAS + pas = (0.4 * npp) + (0.6 * afw) + + return { + "NPP": npp, + "AFW": afw, + "PAS": pas + } + + +def calculate_fluency(srs: float, pas: float) -> Dict[str, float]: + """ + Calculate fluency score based on Speech Rate Stability and Pause Appropriateness Score. 
+ + Args: + srs (float): Speech Rate Stability score (0-100) + pas (float): Pause Appropriateness Score (0-100) + + Returns: + Dict[str, float]: Dictionary with fluency score (0-100) and component contributions + """ + # Equal weighting of SRS and PAS for fluency + fluency_score = (0.5 * srs) + (0.5 * pas) + + + return { + "score": fluency_score, + "SRS_contribution": 0.5 * srs, + "PAS_contribution": 0.5 * pas + } + + +def get_fluency_insight(fluency_score: float) -> str: + """ + Generate insight text based on the fluency score. + + Args: + fluency_score (float): The calculated fluency score (0-100) + + Returns: + str: Insight text explaining the score + """ + if fluency_score >= 85: + return "Excellent fluency with very consistent pacing and natural pauses. Speech flows effortlessly." + elif fluency_score >= 70: + return "Good fluency with generally stable speech rate and appropriate pauses. Some minor inconsistencies." + elif fluency_score >= 50: + return "Moderate fluency with occasional disruptions in speech flow. Consider working on pace stability and pause placement." + elif fluency_score >= 30: + return "Below average fluency with noticeable disruptions. Focus on reducing filler words and maintaining consistent pace." + else: + return "Speech fluency needs significant improvement. Work on maintaining consistent pace, reducing long pauses, and eliminating filler words." \ No newline at end of file diff --git a/fluency/fluency_api.py b/fluency/fluency_api.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4a16314a616b8af2efa6761feabe12a63005c5 --- /dev/null +++ b/fluency/fluency_api.py @@ -0,0 +1,22 @@ +import whisper +from .compute_fluency import compute_fluency_score + +def main(file_path: str, model_size: str = "base") -> dict: + try: + + whisper_model = whisper.load_model(model_size) + + results = compute_fluency_score(file_path, whisper_model) + + # Structure response + response = { + "fluency_score": round(results['fluency_score'], 2) + # "insight": results["insight"], + # "SRS": round(results["SRS"], 2), + # "PAS": round(results["PAS"], 2), + # "transcript": results["transcript"] + } + return response + + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") diff --git a/fluency/main.py b/fluency/main.py new file mode 100644 index 0000000000000000000000000000000000000000..99a220d8c8214f19465ce1b13778b6a2ed067be4 --- /dev/null +++ b/fluency/main.py @@ -0,0 +1,49 @@ +import json +import whisper +from .compute_fluency import compute_fluency_score + +def main(): + """ + Main function to run fluency analysis on audio files + """ + # Fixed parameters - modify these values directly in the code + audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" # Path to your audio file + model_size = "base" # Whisper model size (tiny, base, small, medium, large) + verbose = True # Whether to print detailed results + + try: + # Load whisper model + print(f"Loading Whisper model ({model_size})...") + whisper_model = whisper.load_model(model_size) + + # Calculate fluency score + print(f"Analyzing fluency for {audio_file}...") + results = compute_fluency_score(audio_file, whisper_model) + + # Print summary results + print("\nFluency Analysis Results:") + print(f"- Fluency Score: {results['fluency_score']:.2f}/100") + print(f"- Insight: {results['insight']}") + print(f"- Speech Rate Stability (SRS): {results['SRS']:.2f}/100") + print(f"- Pause Appropriateness (PAS): {results['PAS']:.2f}/100") + + # Print verbose results if enabled + if verbose: 
+ print("\nDetailed Metrics:") + print(f"- Words per minute: {results['components']['wpm']:.1f}") + print(f"- Filler word count: {results['components']['filler_count']}") + print(f"- Long pauses: {results['components']['long_pause_count']}") + print(f"- Pitch variation: {results['components']['pitch_variation']:.2f} semitones") + print(f"- Natural Pause Placement: {results['components']['pas_components']['NPP']:.2f}/100") + print(f"- Avoidance of Filler Words: {results['components']['pas_components']['AFW']:.2f}/100") + + # Print first 100 characters of transcript + transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript'] + print(f"\nTranscript preview: {transcript_preview}") + + except Exception as e: + print(f"Error during analysis: {str(e)}") + return 1 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 97dc7cd8c1fd2f07d8ec79a1117664c5ebaf2842..ecac929cc15b933ccb3cf49975a35ebd0695ce5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,21 @@ + fastapi uvicorn +python-multipart + + +librosa +soundfile +pyworld +scipy + + +openai-whisper==20240930 +spacy==3.8.5 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl + + + +numpy +tqdm +requests \ No newline at end of file diff --git a/tone_modulation/__init__.py b/tone_modulation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tone_modulation/__pycache__/__init__.cpython-312.pyc b/tone_modulation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..595162a2e38a9ee3e8ef1739f3f99258be7bc64e Binary files /dev/null and b/tone_modulation/__pycache__/__init__.cpython-312.pyc differ diff --git a/tone_modulation/__pycache__/sds.cpython-312.pyc b/tone_modulation/__pycache__/sds.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c66263bb21c8cddfe6a11b80b3126b9a302ed819 Binary files /dev/null and b/tone_modulation/__pycache__/sds.cpython-312.pyc differ diff --git a/tone_modulation/__pycache__/tone_api.cpython-312.pyc b/tone_modulation/__pycache__/tone_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57e50da06ffcd75eb1333f71852b4e1858f1ff7f Binary files /dev/null and b/tone_modulation/__pycache__/tone_api.cpython-312.pyc differ diff --git a/tone_modulation/sds.py b/tone_modulation/sds.py new file mode 100644 index 0000000000000000000000000000000000000000..a4166ed8b27f4f38b14a012ddbbd1c6e25c4fbff --- /dev/null +++ b/tone_modulation/sds.py @@ -0,0 +1,385 @@ + +import scipy.signal +import numpy as np +import librosa +import pyworld as pw + +# def compute_pitch_variation(file_path): +# # Step 1: Load audio +# y, sr = librosa.load(file_path, sr=None) +# y = y.astype(np.float64) # pyworld expects float64 + +# # Step 2: Extract pitch (F0) +# _f0, t = pw.dio(y, sr) # Fast initial pitch estimation +# f0 = pw.stonemask(y, _f0, t, sr) # Refinement step + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[f0 > 0] + +# # Handle empty case +# if voiced_f0.size == 0: +# return { +# "pitch_mean": 0.0, +# "pitch_std": 0.0, +# "pitch_range": 0.0, +# "semitone_std": 0.0, +# "pitch_variation_score": 0.0 +# } + +# # Step 4: Basic statistics +# pitch_mean = np.mean(voiced_f0) +# pitch_std = np.std(voiced_f0) +# pitch_range =
np.max(voiced_f0) - np.min(voiced_f0) + +# print(pitch_mean) +# print(f'voiced_f0: {voiced_f0}') +# # Step 5: Compute semitone-based variation (better for human perception) +# median_f0 = np.median(voiced_f0) +# if median_f0 <= 0: +# median_f0 = 1e-6 # Avoid division by zero + +# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) +# semitone_std = np.std(semitone_diffs) +# print(semitone_std) + +# # Step 6: Scale semitone_std to a 0–100 score (tunable) +# # For example: semitone_std of 0 → 0 score, ≥6 semitones → 100 score +# pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100) + +# return { +# "pitch_mean": pitch_mean, +# "pitch_std": pitch_std, +# "pitch_range": pitch_range, +# "semitone_std": semitone_std, +# "pitch_variation_score": pitch_variation_score +# } +# def compute_intonation_range(file_path): +# # Step 1: Load and prepare audio +# y, sr = librosa.load(file_path, sr=None) +# y = y.astype(np.float64) + +# # Step 2: Extract F0 +# _f0, t = pw.dio(y, sr) +# f0 = pw.stonemask(y, _f0, t, sr) + + + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[f0 > 0] +# if voiced_f0.size == 0: +# return 0.0 + +# voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) & +# (voiced_f0 < np.percentile(voiced_f0, 95))] + +# # Step 4: Compute intonation range (in semitones) +# f0_min = np.min(voiced_f0) +# f0_max = np.max(voiced_f0) +# if f0_min <= 0: +# f0_min = 1e-6 # to avoid log error +# intonation_range = 12 * np.log2(f0_max / f0_min) + +# # range into scores: + +# max_range = 12.0 +# normalized = min(intonation_range, max_range) / max_range +# score = normalized * 100 +# return round(score, 2), intonation_range + + + +# def compute_pitch_variation(file_path): +# # Step 1: Load audio +# y, sr = librosa.load(file_path, sr=None) + +# # Step 2: Extract pitch using librosa.pyin (YIN-based) +# f0, voiced_flags, voiced_probs = librosa.pyin( +# y, +# sr=sr, +# fmin=80, +# fmax=400, +# frame_length=1105, +# hop_length=256, +# fill_na=np.nan +# ) + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[~np.isnan(f0)] + + +# voiced_f0 = voiced_f0[ +# (voiced_f0 > np.percentile(voiced_f0, 5)) & +# (voiced_f0 < np.percentile(voiced_f0, 95)) +# ] + +# # Handle empty case +# if voiced_f0.size == 0: +# return { +# "pitch_mean": 0.0, +# "pitch_std": 0.0, +# "pitch_range": 0.0, +# "semitone_std": 0.0, +# "pitch_variation_score": 0.0 +# } + +# # Step 4: Basic statistics +# pitch_mean = float(np.mean(voiced_f0)) +# pitch_std = float(np.std(voiced_f0)) +# pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0)) + + +# # Step 5: Compute semitone-based variation +# median_f0 = np.median(voiced_f0) +# if median_f0 <= 0: +# median_f0 = 1e-6 + +# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) +# semitone_std = float(np.std(semitone_diffs)) + + +# # Step 6: Scale to 0–100 score +# pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100)) +# return { +# "pitch_mean": pitch_mean, +# "pitch_std": pitch_std, +# "pitch_range": pitch_range, +# "semitone_std": semitone_std, +# "pitch_variation_score": pitch_variation_score +# } + +# def compute_intonation_range(file_path): +# # Step 1: Load and prepare audio +# y, sr = librosa.load(file_path, sr=None) + +# # Step 2: Extract F0 using librosa.pyin +# f0, voiced_flags, voiced_probs = librosa.pyin( +# y, +# sr=sr, +# fmin=80, +# fmax=400, +# frame_length=1105, # ensures two periods of fmin fit +# hop_length=256, +# fill_na=np.nan +# ) + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[~np.isnan(f0)] +# if voiced_f0.size == 0: +# 
return 0.0, 0.0 + +# # Optional: remove outliers (5th to 95th percentile) +# voiced_f0 = voiced_f0[ +# (voiced_f0 > np.percentile(voiced_f0, 5)) & +# (voiced_f0 < np.percentile(voiced_f0, 95)) +# ] + +# # Step 4: Compute intonation range in semitones +# f0_min = np.min(voiced_f0) +# f0_max = np.max(voiced_f0) +# if f0_min <= 0: +# f0_min = 1e-6 + +# intonation_range = 12 * np.log2(f0_max / f0_min) + +# # Step 5: Normalize and convert to score out of 100 +# max_range = 12.0 # ~1 octave +# normalized = min(intonation_range, max_range) / max_range +# score = normalized * 100 + +# return round(score, 2), float(intonation_range) + + + +# def compute_speech_rhythm_variability(file_path): +# """ +# Computes the speech rhythm variability score from an audio file. +# The method estimates tempo consistency across time using onset intervals. + +# Returns: +# score (float): Normalized rhythm variability score out of 100. +# raw_std (float): Raw standard deviation of inter-onset intervals. +# """ +# # Step 1: Load audio +# y, sr = librosa.load(file_path, sr=None) + +# # Step 2: Onset detection +# onset_env = librosa.onset.onset_strength(y=y, sr=sr) +# onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time') + +# if len(onsets) < 2: +# return 0.0, 0.0 # Not enough onsets to compute rhythm + +# # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy +# iois = np.diff(onsets) + +# # Optional: Remove outliers (5th–95th percentile) +# ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))] +# if len(ioi_clean) < 2: +# return 0.0, 0.0 + +# # Step 4: Compute variability — standard deviation of IOIs +# raw_std = np.std(ioi_clean) + +# # Step 5: Normalize raw_std to 0–100 score +# # Lower std = more consistent rhythm → higher score +# min_std = 0.05 # near-perfect rhythm (tight pacing) +# max_std = 0.6 # highly irregular rhythm + +# # Clamp and reverse-score +# clamped_std = np.clip(raw_std, min_std, max_std) +# normalized = 1 - (clamped_std - min_std) / (max_std - min_std) +# score = normalized * 100 + +# return round(score, 2), round(float(raw_std), 4) + + +# def calc_sds(file_path): + +# # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability + +# pitch_variation = compute_pitch_variation(file_path) +# intonation_range = compute_intonation_range(file_path) +# speech_rhythm_variability = compute_speech_rhythm_variability(file_path) +# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") +# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") +# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") + +# sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0] +# return round(sds, 2) + +# path = r'D:\Intern\shankh\audio_samples\anga.wav' + +# result = calc_sds(path) +# print(f"SDS: {result}") + +import numpy as np +import librosa +import pyworld + +def compute_pitch_variation(file_path): + # Step 1: Load audio + y, sr = librosa.load(file_path, sr=None) + + # Step 2: Extract pitch using pyworld + _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr) + f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr) + + # Step 3: Filter voiced frames + voiced_f0 = f0[f0 > 0] + + # Remove outliers (5th to 95th percentile) + voiced_f0 = voiced_f0[ + (voiced_f0 > np.percentile(voiced_f0, 5)) & + (voiced_f0 < np.percentile(voiced_f0, 95)) + ] + + if 
voiced_f0.size == 0: + return { + "pitch_mean": 0.0, + "pitch_std": 0.0, + "pitch_range": 0.0, + "semitone_std": 0.0, + "pitch_variation_score": 0.0 + } + + # Step 4: Basic statistics + pitch_mean = float(np.mean(voiced_f0)) + pitch_std = float(np.std(voiced_f0)) + pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0)) + + # Step 5: Semitone-based variation + median_f0 = np.median(voiced_f0) + if median_f0 <= 0: + median_f0 = 1e-6 + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + semitone_std = float(np.std(semitone_diffs)) + + # Step 6: Scaled variation score + pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100)) + + return { + "pitch_mean": pitch_mean, + "pitch_std": pitch_std, + "pitch_range": pitch_range, + "semitone_std": semitone_std, + "pitch_variation_score": pitch_variation_score + } + + +def compute_intonation_range(file_path): + # Step 1: Load audio + y, sr = librosa.load(file_path, sr=None) + + # Step 2: Extract pitch using pyworld + _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr) + f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr) + + # Step 3: Filter voiced frames + voiced_f0 = f0[f0 > 0] + if voiced_f0.size == 0: + return 0.0, 0.0 + + # Remove outliers + voiced_f0 = voiced_f0[ + (voiced_f0 > np.percentile(voiced_f0, 5)) & + (voiced_f0 < np.percentile(voiced_f0, 95)) + ] + if voiced_f0.size == 0: + return 0.0, 0.0 + + # Step 4: Compute intonation range + f0_min = np.min(voiced_f0) + f0_max = np.max(voiced_f0) + if f0_min <= 0: + f0_min = 1e-6 + intonation_range = 12 * np.log2(f0_max / f0_min) + + # Step 5: Normalize + max_range = 12.0 + normalized = min(intonation_range, max_range) / max_range + score = normalized * 100 + + return round(score, 2), float(intonation_range) + + +def compute_speech_rhythm_variability(file_path): + """ + Computes the speech rhythm variability score from an audio file. + The method estimates tempo consistency across time using onset intervals. 
+ """ + y, sr = librosa.load(file_path, sr=None) + + # Step 2: Onset detection + onset_env = librosa.onset.onset_strength(y=y, sr=sr) + onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time') + + if len(onsets) < 2: + return 0.0, 0.0 + + iois = np.diff(onsets) + + ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))] + if len(ioi_clean) < 2: + return 0.0, 0.0 + + raw_std = np.std(ioi_clean) + + min_std = 0.05 + max_std = 0.6 + clamped_std = np.clip(raw_std, min_std, max_std) + normalized = 1 - (clamped_std - min_std) / (max_std - min_std) + score = normalized * 100 + + return round(score, 2), round(float(raw_std), 4) + + +def calc_sds(file_path): + pitch_variation = compute_pitch_variation(file_path) + intonation_range = compute_intonation_range(file_path) + speech_rhythm_variability = compute_speech_rhythm_variability(file_path) + + sds = 0.35 * pitch_variation['pitch_variation_score'] + \ + 0.35 * intonation_range[0] + \ + 0.3 * speech_rhythm_variability[0] + + return round(sds, 2) diff --git a/tone_modulation/tone_api.py b/tone_modulation/tone_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5a59ec029454e5eef1c768ab55415c04edf564de --- /dev/null +++ b/tone_modulation/tone_api.py @@ -0,0 +1,23 @@ + +from .sds import calc_sds + +import logging +logger = logging.getLogger(__name__) + +def main(file_path: str) -> dict: + logger.info(f"Starting tone analysis for: {file_path}") + try: + + + results = calc_sds(file_path) + + # Structure response + response = { + "speech_dynamism_score" : round(results, 2), + } + logger.info("Tone analysis complete") + return response + + except Exception as e: + logger.error(f"Tone analysis failed internally: {e}", exc_info=True) + raise RuntimeError(f"Error during analysis: {str(e)}") \ No newline at end of file diff --git a/transcribe.py b/transcribe.py new file mode 100644 index 0000000000000000000000000000000000000000..211a092dd0ad757691de19fbcbf56e3041d7322f --- /dev/null +++ b/transcribe.py @@ -0,0 +1,24 @@ +# using whisper to transcribe audio files + +import whisper +import os + +def transcribe_audio(file_path, model_size="base"): + """ + Transcribe audio file using Whisper model. + + Args: + file_path (str): Path to the audio file. + model_size (str): Size of the Whisper model to use. Options are "tiny", "base", "small", "medium", "large". + + Returns: + str: Transcription of the audio file. 
+ """ + # Load the Whisper model + model = whisper.load_model(model_size) + + # Transcribe the audio file + result = model.transcribe(file_path, fp16=False) + + # Return the transcription + return result["text"] \ No newline at end of file diff --git a/vcs/__init__.py b/vcs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vcs/__pycache__/__init__.cpython-312.pyc b/vcs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65a543bb27d68627d356bbd1bb46096f0a8e3005 Binary files /dev/null and b/vcs/__pycache__/__init__.cpython-312.pyc differ diff --git a/vcs/__pycache__/compute_vcs.cpython-312.pyc b/vcs/__pycache__/compute_vcs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a03707a3f1a37d2028b59c27a84460cf889fd2e5 Binary files /dev/null and b/vcs/__pycache__/compute_vcs.cpython-312.pyc differ diff --git a/vcs/__pycache__/main.cpython-312.pyc b/vcs/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46325ebdf675ad8b1c4367ea66016deb4ba0146b Binary files /dev/null and b/vcs/__pycache__/main.cpython-312.pyc differ diff --git a/vcs/__pycache__/vcs.cpython-312.pyc b/vcs/__pycache__/vcs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b612456c4c869dbc07fcda8547955b4d347e168e Binary files /dev/null and b/vcs/__pycache__/vcs.cpython-312.pyc differ diff --git a/vcs/__pycache__/vcs_api.cpython-312.pyc b/vcs/__pycache__/vcs_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd619aed9a3a9305bc7862e728e95678d804091c Binary files /dev/null and b/vcs/__pycache__/vcs_api.cpython-312.pyc differ diff --git a/vcs/compute_vcs.py b/vcs/compute_vcs.py new file mode 100644 index 0000000000000000000000000000000000000000..33affc9fac48f9b813bec1da140046c6c1bae321 --- /dev/null +++ b/vcs/compute_vcs.py @@ -0,0 +1,117 @@ +""" +Compute Voice Clarity Score from audio file +""" + +import librosa +import numpy as np +from typing import Dict, Any +from .vcs import calculate_voice_clarity_score, get_clarity_insight + +def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]: + """ + Compute Voice Clarity Score and its components from a speech sample. + + Args: + file_path (str): Path to the audio file. + whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper) + + Returns: + dict: A dictionary containing Voice Clarity Score and component scores. 
+ """ + # Transcribe audio + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + # Validate early + if not transcript or not segments: + raise ValueError("Empty transcript or segments from Whisper.") + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + if duration <= 0: + raise ValueError("Audio duration invalid or zero.") + + # Calculate Voice Clarity Score + clarity_result = calculate_voice_clarity_score(y, sr, segments) + + # Add transcript to results + clarity_result["transcript"] = transcript + + # Add word count and duration info for reference + word_count = len(transcript.split()) + clarity_result["components"]["word_count"] = word_count + clarity_result["components"]["duration"] = duration + + return clarity_result + +def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]: + """ + Comprehensive voice quality analysis including clarity. + + Args: + file_path (str): Path to the audio file + whisper_model: Transcription model + + Returns: + Dict[str, Any]: Complete voice quality analysis + """ + # Get Voice Clarity Score + clarity_results = compute_voice_clarity_score(file_path, whisper_model) + vcs = clarity_results["VCS"] + + # Load audio for additional analysis + y, sr = librosa.load(file_path, sr=None) + + # Calculate additional voice quality metrics + + # Voice stability - based on pitch (F0) stability + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + + pitch_stability = 0.0 + if voiced_f0.size > 0: + # Calculate coefficient of variation (lower is more stable) + cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf') + # Convert to score (0-100) + pitch_stability = max(0, min(100, 100 - (cv * 100))) + + # Voice resonance - based on spectral bandwidth + bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)) + # Normalize (ideal range is around 1500-2500 Hz for speech) + if bandwidth < 1000: + resonance_score = max(0, bandwidth / 1000 * 70) # Too narrow + elif bandwidth <= 2500: + resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30) # Optimal range + else: + resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50)) # Too wide + + # Voice strength - based on RMS energy + rms = np.mean(librosa.feature.rms(y=y)) + # Normalize (typical speech RMS values range from 0.01 to 0.2) + strength_score = min(100, max(0, rms / 0.2 * 100)) + + # Combine additional metrics + additional_metrics = { + "pitch_stability": pitch_stability, + "voice_resonance": resonance_score, + "voice_strength": strength_score + } + + # Add to results + combined_results = { + "VCS": vcs, + "insight": clarity_results["insight"], + "components": { + **clarity_results["components"], + **additional_metrics + }, + "transcript": clarity_results["transcript"] + } + + return combined_results + +# Ensure the functions are exposed when imported +__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality'] \ No newline at end of file diff --git a/vcs/main.py b/vcs/main.py new file mode 100644 index 0000000000000000000000000000000000000000..fcc9a7641862986cfcccd963d970d3da65dc4570 --- /dev/null +++ b/vcs/main.py @@ -0,0 +1,49 @@ +import json +import whisper +from .compute_vcs import analyze_voice_quality + +def main(): + """ + Main function to run voice clarity analysis on audio files + """ + # Fixed 
parameters - modify these values directly in the code + audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" # Path to your audio file + model_size = "base" # Whisper model size (tiny, base, small, medium, large) + verbose = True # Whether to print detailed results + + try: + # Load whisper model + print(f"Loading Whisper model ({model_size})...") + whisper_model = whisper.load_model(model_size) + + # Calculate voice clarity score + print(f"Analyzing voice clarity for {audio_file}...") + results = analyze_voice_quality(audio_file, whisper_model) + + # Print summary results + print("\nVoice Quality Analysis Results:") + print(f"- Voice Clarity Score (VCS): {results['VCS']:.2f}/100") + print(f"- Insight: {results['insight']}") + print(f"- Articulation: {results['components']['articulation']:.2f}/100") + print(f"- Enunciation: {results['components']['enunciation']:.2f}/100") + print(f"- Speech Pause Control: {results['components']['speech_pause_control']:.2f}/100") + + # Print verbose results if enabled + if verbose: + print("\nDetailed Metrics:") + print(f"- Pitch Stability: {results['components']['pitch_stability']:.2f}/100") + print(f"- Voice Resonance: {results['components']['voice_resonance']:.2f}/100") + print(f"- Voice Strength: {results['components']['voice_strength']:.2f}/100") + print(f"- Word Count: {results['components']['word_count']}") + print(f"- Duration: {results['components']['duration']:.2f} seconds") + + # Print first 100 characters of transcript + transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript'] + print(f"\nTranscript preview: {transcript_preview}") + + except Exception as e: + print(f"Error during analysis: {str(e)}") + return 1 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/vcs/vcs.py b/vcs/vcs.py new file mode 100644 index 0000000000000000000000000000000000000000..98eca0e34280ae382e49e2af94f478e7d4b9151d --- /dev/null +++ b/vcs/vcs.py @@ -0,0 +1,176 @@ +""" +Voice Clarity Score calculation module +""" + +import librosa +import numpy as np +from typing import Dict, Any, List +import soundfile as sf + +def calculate_articulation(y: np.ndarray, sr: int) -> float: + """ + Calculate articulation quality based on spectral contrast. + + Articulation refers to how clearly individual phonemes are produced. + + Args: + y (np.ndarray): Audio signal + sr (int): Sample rate + + Returns: + float: Articulation score (0-100) + """ + # Extract spectral contrast + # Higher contrast between peaks and valleys in the spectrum generally correlates with clearer articulation + S = np.abs(librosa.stft(y)) + contrast = librosa.feature.spectral_contrast(S=S, sr=sr) + + # Average across frequency bands and frames + mean_contrast = np.mean(contrast) + + # Normalize to 0-100 scale (empirically determined range) + # Typical values range from 10-50 dB + min_contrast = 10 + max_contrast = 50 + normalized_contrast = min(100, max(0, (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100)) + + return normalized_contrast + +def calculate_enunciation(y: np.ndarray, sr: int) -> float: + """ + Calculate enunciation quality based on formant clarity and spectral flatness. + + Enunciation is the precision in pronouncing vowels and consonants.
+ + Args: + y (np.ndarray): Audio signal + sr (int): Sample rate + + Returns: + float: Enunciation score (0-100) + """ + # Compute spectral flatness - lower values indicate clearer formants and better enunciation + flatness = np.mean(librosa.feature.spectral_flatness(y=y)) + + # Compute spectral centroid - related to "brightness" or articulation clarity + centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)) + + # Normalize flatness (lower is better for speech) - range typically 0.01-0.5 + norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100)) + + # Normalize centroid (mid-range is better for clear speech) - typically 1000-4000 Hz for clear speech + ideal_centroid = 2500 # Hz + centroid_deviation = abs(centroid - ideal_centroid) / 2000 # Normalized by expected deviation + norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100)) + + # Combine the two metrics (with more weight on flatness) + enunciation_score = (0.7 * norm_flatness) + (0.3 * norm_centroid) + + return enunciation_score + +def calculate_speech_pause_control(segments: List[Dict]) -> float: + """ + Calculate how effectively pauses are integrated in speech. + + Speech pause control refers to the natural vs. abrupt pauses in speech. + + Args: + segments (List[Dict]): List of transcript segments with timing information + + Returns: + float: Speech pause control score (0-100) + """ + if len(segments) < 2: + return 100.0 # Not enough segments to evaluate pauses + + pause_durations = [] + for i in range(len(segments) - 1): + pause_dur = segments[i + 1]["start"] - segments[i]["end"] + if pause_dur > 0.05: # Only consider actual pauses + pause_durations.append(pause_dur) + + if not pause_durations: + return 100.0 # No significant pauses detected + + # Calculate the standard deviation of pause durations + # More consistent pauses indicate better control + pause_std = np.std(pause_durations) + + # Calculate proportion of very long pauses (potentially awkward) + long_pauses = sum(1 for d in pause_durations if d > 2.0) + long_pause_ratio = long_pauses / len(pause_durations) if pause_durations else 0 + + # Normalize std dev (lower is better, but not too low) + # Ideal range is around 0.2-0.5 seconds + if pause_std < 0.1: + std_score = 70 # Too consistent might sound robotic + elif pause_std < 0.5: + std_score = 100 - ((pause_std - 0.1) / 0.4 * 30) # Scale 70-100 + else: + std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70)) # Scale down from 70 + + # Penalize for too many long pauses + long_pause_penalty = long_pause_ratio * 50 + + # Final score + pause_control_score = max(0, min(100, std_score - long_pause_penalty)) + + return pause_control_score + +def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]: + """ + Calculate the Voice Clarity Score (VCS) and its components. + + VCS reflects the clarity and intelligibility of speech. 
+ + Args: + y (np.ndarray): Audio signal + sr (int): Sample rate + segments (List[Dict]): List of transcript segments with timing information + + Returns: + Dict[str, Any]: Dictionary with VCS and component scores + """ + # Calculate component scores + articulation_score = calculate_articulation(y, sr) + enunciation_score = calculate_enunciation(y, sr) + speech_pause_control_score = calculate_speech_pause_control(segments) + + # Calculate Voice Clarity Score using the formula from the paper + vcs = (0.45 * articulation_score) + (0.35 * enunciation_score) + (0.2 * speech_pause_control_score) + + # Create result dictionary + result = { + "VCS": vcs, + "components": { + "articulation": articulation_score, + "enunciation": enunciation_score, + "speech_pause_control": speech_pause_control_score + } + } + + # Add interpretation + result["insight"] = get_clarity_insight(vcs) + + return result + +def get_clarity_insight(vcs: float) -> str: + """ + Generate insight text based on the Voice Clarity Score. + + Args: + vcs (float): Voice Clarity Score (0-100) + + Returns: + str: Insight text explaining the score + """ + if vcs >= 85: + return "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to." + elif vcs >= 70: + return "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity." + elif vcs >= 50: + return "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing." + elif vcs >= 30: + return "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity." + else: + return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial." 
\ No newline at end of file diff --git a/vcs/vcs_api.py b/vcs/vcs_api.py new file mode 100644 index 0000000000000000000000000000000000000000..fcf1d636cfbce50b29ffc9b080491573ac995ba0 --- /dev/null +++ b/vcs/vcs_api.py @@ -0,0 +1,21 @@ +import whisper +from .compute_vcs import analyze_voice_quality + +def main(file_path: str, model_size: str = "base") -> dict: + try: + + whisper_model = whisper.load_model(model_size) + + results = analyze_voice_quality(file_path, whisper_model) + + # Structure response + response = { + "Voice Clarity Score": round(results['VCS'], 2) + # "Articulation": round(results['components']['articulation'],2), + # "Enunciation": round(results['components']['enunciation'],2), + # "Speech Pause Control": round(results['components']['speech_pause_control'],2), + } + return response + + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") diff --git a/vers/__init__.py b/vers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vers/__pycache__/__init__.cpython-312.pyc b/vers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ca7715ec79e60ffec669381a54283286f28ac53 Binary files /dev/null and b/vers/__pycache__/__init__.cpython-312.pyc differ diff --git a/vers/__pycache__/compute_vers_score.cpython-312.pyc b/vers/__pycache__/compute_vers_score.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae686d8b2282cc1560fb62c21af56c56ed639018 Binary files /dev/null and b/vers/__pycache__/compute_vers_score.cpython-312.pyc differ diff --git a/vers/__pycache__/filler_analyzer.cpython-312.pyc b/vers/__pycache__/filler_analyzer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2df2fa39e3c178d968e88fa6ccd9a21d91c4f1bc Binary files /dev/null and b/vers/__pycache__/filler_analyzer.cpython-312.pyc differ diff --git a/vers/__pycache__/find_valence.cpython-312.pyc b/vers/__pycache__/find_valence.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78c60f1141163b69c49345d13608b94147de580d Binary files /dev/null and b/vers/__pycache__/find_valence.cpython-312.pyc differ diff --git a/vers/__pycache__/main.cpython-312.pyc b/vers/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6d673f6560aaeda413e262e9aad5e43a795ca96 Binary files /dev/null and b/vers/__pycache__/main.cpython-312.pyc differ diff --git a/vers/__pycache__/vers.cpython-312.pyc b/vers/__pycache__/vers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cc3058d2e107ec4f48ef18731ab43366e2edff6 Binary files /dev/null and b/vers/__pycache__/vers.cpython-312.pyc differ diff --git a/vers/__pycache__/vers_api.cpython-312.pyc b/vers/__pycache__/vers_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7549a08005ec81933ed721ef59fd02755142ca09 Binary files /dev/null and b/vers/__pycache__/vers_api.cpython-312.pyc differ diff --git a/vers/compute_vers_score.py b/vers/compute_vers_score.py new file mode 100644 index 0000000000000000000000000000000000000000..f4c8a14e3dab0e0c6dd5121796ef430b7db071eb --- /dev/null +++ b/vers/compute_vers_score.py @@ -0,0 +1,85 @@ +from .vers import calc_vers +import librosa +import numpy as np +import math +from .filler_analyzer import detect_fillers +from .find_valence import get_valence_score + +def compute_vers_score(file_path: str,
whisper_model) -> dict: + """ + Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample. + """ + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + + + # Filler count + filler_count, _ = detect_fillers(transcript) + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + + # Volume (RMS) + rms = librosa.feature.rms(y=y)[0] + mean_rms = float(np.mean(rms)) + mean_volume_db = 20 * math.log10(mean_rms + 1e-6) if mean_rms > 0 else -80.0 + volume_std = np.std(20 * np.log10(rms + 1e-6)) + + # Max volume + vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0 + vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0 + + # Pitch variation + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + pitch_variation = 0.0 + if voiced_f0.size > 0: + median_f0 = np.nanmedian(voiced_f0) + median_f0 = max(median_f0, 1e-6) + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + pitch_variation = float(np.nanstd(semitone_diffs)) + + # Pause analysis + total_speaking_time = 0.0 + long_pause_count = 0 + if segments: + for seg in segments: + total_speaking_time += (seg["end"] - seg["start"]) + for i in range(len(segments) - 1): + pause_dur = segments[i+1]["start"] - segments[i]["end"] + if pause_dur > 1.0: + long_pause_count += 1 + first_start = segments[0]["start"] + last_end = segments[-1]["end"] + if first_start > 1.0: + long_pause_count += 1 + if duration - last_end > 1.0: + long_pause_count += 1 + + # WPM + words = transcript.split() + word_count = len(words) + words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0 + + + valence_scores = get_valence_score(file_path) + + # Calculate VERS + vers_result = calc_vers( + filler_count=filler_count, + long_pause_count=long_pause_count, + pitch_variation=pitch_variation, + mean_volume_db=mean_volume_db, + vol_max_db=vol_max_db, + wpm=words_per_min, + volume_std=volume_std, + valence_scores=valence_scores + ) + + # Include transcript optionally + vers_result["transcript"] = transcript + return vers_result diff --git a/vers/filler_analyzer.py b/vers/filler_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..9db9c83861b928c0663b0b00655c06e5c5484f12 --- /dev/null +++ b/vers/filler_analyzer.py @@ -0,0 +1,101 @@ +# Define filler words for English, Hindi, Tamil (in both Latin and native scripts) +# Mapping each variant to a common label (usually the Latin script for insight reporting) +FILLER_VARIANTS = { + # English fillers + "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er", + "umm": "um", "uhh": "uh", "mmm": "hmm", + "like": "like", "you know": "you know", "so": "so", "well": "well", + # Hindi fillers (Devanagari and transliteration) + "मतलब": "matlab", "matlab": "matlab", + "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain", + "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na", + "ऐसा है": "aisa hai", "aisa hai": "aisa hai", + "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan" + "अच्छा": "acha", "acha": "acha", + # Tamil fillers (Tamil script and transliteration) + "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na", + "அப்பரம்": "apparam", "apparam": "apparam", + "என்ன": "enna", "enna": "enna" +} + +def detect_fillers(transcript): + 
""" + Detects filler words in the transcript. + + Args: + transcript: Full transcript text + + Returns: + tuple: (filler_count, filler_occurrences) + """ + transcript_lower = transcript.lower() + filler_count = 0 + # Track which specific fillers were used (for insight examples) + filler_occurrences = {} + + for variant, label in FILLER_VARIANTS.items(): + if variant in transcript_lower: + count = transcript_lower.count(variant) + if count > 0: + filler_count += count + # Accumulate count for the normalized label + filler_occurrences[label] = filler_occurrences.get(label, 0) + count + + return filler_count, filler_occurrences + + +def analyze_filler_words(filler_count, filler_occurrences, duration): + """ + Analyzes filler word usage in speech. + + Args: + filler_count: Total count of filler words + filler_occurrences: Dictionary of specific filler words and their counts + duration: Duration of the audio in seconds + + Returns: + dict: Contains the filler words score and insight text + """ + # Extract top examples for insights + filler_examples = [] + if filler_occurrences: + # Sort by frequency + sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True) + for label, count in sorted_fillers[:2]: + filler_examples.append(label) + + # Compute fillers per minute as a gauge + filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0 + + if filler_count == 0: + filler_score = 10 + elif filler_per_min < 1: + filler_score = 9 + elif filler_per_min < 3: + filler_score = 8 + elif filler_per_min < 5: + filler_score = 6 + elif filler_per_min < 10: + filler_score = 4 + else: + filler_score = 2 + + filler_score = max(0, filler_score) + + # Generate insight text based on the score and examples + if filler_count == 0: + insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear." + elif filler_count <= 2: + example = filler_examples[0] if filler_examples else "um" + insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact." + elif filler_count <= 5: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words" + insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity." + else: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'" + insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty." 
+ + return { + "score": int(filler_score), + "insight": insight + } \ No newline at end of file diff --git a/vers/find_valence.py b/vers/find_valence.py new file mode 100644 index 0000000000000000000000000000000000000000..6d34558dc2063857d89cd8007717bf8908fd1c86 --- /dev/null +++ b/vers/find_valence.py @@ -0,0 +1,100 @@ +# from transformers.models.wav2vec2 import Wav2Vec2Model, Wav2Vec2FeatureExtractor +# import torchaudio +# import torch +# import torch.nn as nn + + + +def get_valence_score(file_path): + # class VADPredictor(nn.Module): + # """Model to predict VAD Scores""" + # def __init__(self, pretrained_model_name="facebook/wav2vec2-base-960h", freeze_feature_extractor=True): + # super(VADPredictor, self).__init__() + + # self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name) + + # if freeze_feature_extractor: + # for param in self.wav2vec2.feature_extractor.parameters(): + # param.requires_grad = False + + # hidden_size = self.wav2vec2.config.hidden_size + + # self.valence_layers = nn.Sequential( + # nn.Linear(hidden_size, 256), + # nn.ReLU(), + # nn.Dropout(0.3), + # nn.Linear(256,64), + # nn.Linear(64,1) + # ) + # self.arousal_layers = nn.Sequential( + # nn.Linear(hidden_size, 256), + # nn.ReLU(), + # nn.Dropout(0.3), + # nn.Linear(256,64), + # nn.Linear(64,1) + # ) + # self.dominance_layers = nn.Sequential( + # nn.Linear(hidden_size, 256), + # nn.ReLU(), + # nn.Dropout(0.3), + # nn.Linear(256,64), + # nn.Linear(64,1) + # ) + + # def forward(self, input_values, attention_mask=None): + # outputs = self.wav2vec2(input_values, attention_mask=attention_mask) + # last_hidden_state = outputs.last_hidden_state + # pooled_output = torch.mean(last_hidden_state, dim=1) + + # valence = self.valence_layers(pooled_output) + # arousal = self.arousal_layers(pooled_output) + # dominance = self.dominance_layers(pooled_output) + + # return { + # 'valence': valence.squeeze(-1), + # 'arousal': arousal.squeeze(-1), + # 'dominance': dominance.squeeze(-1) + # } + + + # model = VADPredictor() + # model.load_state_dict(torch.load(r"D:\Intern\shankh\DUMP\vad_predictor_model.pt", map_location=torch.device("cpu"))) + # model.eval() + + # feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + + # # Load and process audio + # file_path = file_path + # waveform, sr = torchaudio.load(file_path) + + # # Convert to mono + # if waveform.shape[0] > 1: + # waveform = waveform.mean(dim=0, keepdim=True) + + # # Resample to 16000 Hz + # if sr != 16000: + # resampler = torchaudio.transforms.Resample(sr, 16000) + # waveform = resampler(waveform) + # sr = 16000 + + # # Normalize + # waveform = waveform / waveform.abs().max() + + # # Parameters + # segment_sec = 1 + # segment_samples = int(segment_sec * sr) + + # valence_scores = [] + + # # Inference per segment + # with torch.no_grad(): + # for start in range(0, waveform.shape[1] - segment_samples + 1, segment_samples): + # segment = waveform[:, start:start+segment_samples] + # input_values = feature_extractor(segment.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values + # output = model(input_values) + # val = output['valence'].item() + # valence_scores.append(val) + valence_scores = 5.0 + + return valence_scores + diff --git a/vers/main.py b/vers/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f1c7749db582f8154dbc2904c2b85d84101676f3 --- /dev/null +++ b/vers/main.py @@ -0,0 +1,16 @@ + +from .compute_vers_score import compute_vers_score +import whisper + + + 
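+# Smoke test for the VERS pipeline: load a Whisper model once and score a local
+# sample recording. The hard-coded path below is a development sample; point it
+# at any readable audio file to try the scorer.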
+whisper_model = whisper.load_model("base") + +test_result = compute_vers_score(r"D:\Intern\shankh\audio_samples\obama_short.wav", whisper_model) + +print("VERS Score:", test_result["VERS"]) +print("ESS:", test_result["ESS"]) +print("LCS:", test_result["LCS"]) +print("SRS:", test_result["SRS"]) +print("Insight:", test_result["insight"]) +print("Transcript:", test_result["transcript"]) diff --git a/vers/vers.py b/vers/vers.py new file mode 100644 index 0000000000000000000000000000000000000000..4a207a6abd9ea3b66e7d4f5f508ffda1060c808c --- /dev/null +++ b/vers/vers.py @@ -0,0 +1,118 @@ +import numpy as np + +def calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores): + """ + Emotional Stability Score(ESS) : Measures the consistency of the speaker's emotional tone, reflecting their ability to regulate emotions during speech. + + Requires: + Tonal Steadiness: The lack of extreme fluctuations in emotional tone. + Absence of Sudden Loudness Spikes: Indicates controlled expression without abrupt emotional shifts. + Valence Stability: Consistency in the overall positive or negative tone across the speech. + """ + # calculate tonal steadiness + tonal_steadiness = max(0, 100 - (pitch_variation * 10)) + + # calculate loudness spikes + spike = max(0, vol_max_db - mean_volume_db - 15) + spike_ratio = min(spike / 30, 1.0) # Normalize with typical loudness range + stability = 1 - spike_ratio + loudness_stability = stability * 100 + + # calculate valence stability + valence_stability = 100 - (np.std(valence_scores) * 20) + + ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability)) + print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}") + return ESS + +def calc_lcs(volume_std, vol_max_db, mean_volume_db): + """ + Loudness Control Score (LCS): Evaluates how well the speaker manages volume + + Requires: + - Volume Stability: Consistency in speech amplitude. + - Controlled Emphasis: The ability to modulate loudness smoothly for emphasis rather than abrupt changes. + """ + vol_stability = max(0, 100 - (volume_std * 5)) # Scale std for speech (5 dB std = 75) + + # Controlled Emphasis (45%) + emphasis_spike = max(0, vol_max_db - mean_volume_db - 3) + spike_ratio = min(emphasis_spike / 15, 1.0) # Normalize to 15 dB range + emphasis_control = (1 - spike_ratio) * 100 + + # Combine scores + lcs = 0.55 * vol_stability + 0.45 * emphasis_control + print(f"vol_stability: {vol_stability}, emphasis_control: {emphasis_control}") + return min(100, max(0, lcs)) + +def calc_srs(wpm, filler_count, long_pause_count, pitch_variation): + """ + Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm. + + Requires: + - Words per Minute Consistency: Regularity in speech speed. + - Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes. 
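+
+    Args:
+        wpm: Observed speaking rate in words per minute (150 wpm is treated as ideal).
+        filler_count: Number of filler words detected in the transcript.
+        long_pause_count: Number of pauses longer than one second.
+        pitch_variation: Standard deviation of pitch in semitones.
+
+    Returns:
+        float: SRS on a 0-100 scale.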
+ """ + ideal_wpm = 150 + wpm_deviation = min(30, abs(wpm - ideal_wpm)) # Cap at 30 WPM deviation + wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) # 100-50 for max deviation + + # Sudden Speech Shift Penalty + filler_penalty = min(filler_count / 10, 1.0) + pause_penalty = min(long_pause_count / 5, 1.0) + pitch_penalty = min(pitch_variation / 3.0, 1.0) # High variation → unstable + + # Combine into absence of sudden shifts + stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100 + + # Final SRS Score + SRS = (0.45 * wpm_consistency) + (0.55 * stability) + print(f"wpm_consistency: {wpm_consistency}, stability: {stability}") + return min(100, max(0, SRS)) + +def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores): + ESS = calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores) + LCS = calc_lcs(volume_std, vol_max_db, mean_volume_db) + SRS = calc_srs(wpm, filler_count, long_pause_count, pitch_variation) + + # Calculate the VERS score using the formula + VERS = (0.5 * ESS) + (0.3 * LCS) + (0.2 * SRS) # This would be value from 0 to 100 + + if VERS > 0 and VERS < 50: + insight = """Poor regulation—noticeable swings in tone and uncontrolled + emotional expression. Feedback: Consider exercises and professional + coaching to stabilize your emotional delivery.""" + elif VERS >= 50 and VERS < 80: + insight = """Moderate regulation—occasional fluctuations or abrupt changes. + Feedback: Work on smoothing out volume changes and maintaining a steady tone.""" + elif VERS >= 80 and VERS <= 100: + insight = """Excellent regulation—steady tone and controlled volume dynamics. + Feedback: Continue using techniques that maintain emotional balance.""" + else: + insight = "Invalid score calculated" + + return { + "VERS": int(VERS), + "ESS": round(ESS, 1), + "LCS": round(LCS, 1), + "SRS": round(SRS, 1), + "insight": insight + } + +# # Test input +# test_result = calc_vers( +# filler_count=4, +# long_pause_count=2, +# pitch_variation=3.2, +# mean_volume_db=65, +# vol_max_db=82, +# wpm=148, +# volume_std=4.1, +# valence_scores=[5.2, 5.5, 4.9] +# ) + +# print("VERS Score:", test_result["VERS"]) +# print("ESS:", test_result["ESS"]) +# print("LCS:", test_result["LCS"]) +# print("SRS:", test_result["SRS"]) +# print("Insight:", test_result["insight"]) \ No newline at end of file diff --git a/vers/vers_api.py b/vers/vers_api.py new file mode 100644 index 0000000000000000000000000000000000000000..304efe045939d0ea8b94d2b8b290fb23504d124c --- /dev/null +++ b/vers/vers_api.py @@ -0,0 +1,44 @@ +import whisper +import numpy as np +from .compute_vers_score import compute_vers_score + +def convert_numpy_types(obj): + """Convert NumPy types to Python native types for JSON serialization.""" + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {k: convert_numpy_types(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_numpy_types(i) for i in obj] + else: + return obj + +def main(file_path: str, model_size: str = "base") -> dict: + try: + # Load whisper model + whisper_model = whisper.load_model(model_size) + + # Compute VERS score + results = compute_vers_score(file_path, whisper_model) + + # Convert any NumPy types to native Python types + results = convert_numpy_types(results) + + # Structure response with rounded values + # (using Python's 
built-in round function which returns Python native float) + response = { + "VERS Score": round(results['VERS'], 2) + # "ESS": round(results['ESS'], 2), + # "LCS": round(results['LCS'], 2), + # "SRS": round(results['SRS'], 2), + # "Insight": results['insight'], + } + + return response + + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") \ No newline at end of file diff --git a/ves/__init__.py b/ves/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ves/__pycache__/__init__.cpython-312.pyc b/ves/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f56a2146d0c8633d32e5630212a5f9a290530ca Binary files /dev/null and b/ves/__pycache__/__init__.cpython-312.pyc differ diff --git a/ves/__pycache__/ves.cpython-312.pyc b/ves/__pycache__/ves.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee24dd581cb595943ae3065b909db3486d233944 Binary files /dev/null and b/ves/__pycache__/ves.cpython-312.pyc differ diff --git a/ves/ves.py b/ves/ves.py new file mode 100644 index 0000000000000000000000000000000000000000..db7b35e07adda37d45758ff133b275d09bed5265 --- /dev/null +++ b/ves/ves.py @@ -0,0 +1,26 @@ +# voice engagement score = 0.4 * valence + 0.3 * arousal + 0.3 * SDS +from tone_modulation.sds import calc_sds + +def get_valence_and_arousal(file_path): + + valence = 4.5 #we get this from model + + arousal = 3.2 #we get this from model + + return valence, arousal + + + +def calc_voice_engagement_score(file_path): + valence, arousal = get_valence_and_arousal(file_path) + + # Calculate SDS + + sds = calc_sds(file_path) + + ves = 0.4 * valence + 0.3 * arousal + 0.3 * sds + + return { + # "sds": sds, + "ves": ves + } \ No newline at end of file diff --git a/voice_confidence_score/__init__.py b/voice_confidence_score/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/voice_confidence_score/__pycache__/__init__.cpython-312.pyc b/voice_confidence_score/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecc05c5be8cc0abc43a90fa6f090322bafd17a45 Binary files /dev/null and b/voice_confidence_score/__pycache__/__init__.cpython-312.pyc differ diff --git a/voice_confidence_score/__pycache__/main.cpython-312.pyc b/voice_confidence_score/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..970cd1c3c08430183e19a5ca90fc447092215180 Binary files /dev/null and b/voice_confidence_score/__pycache__/main.cpython-312.pyc differ diff --git a/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc b/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0be2d91ea85d9c610e762a8ffcf03c247ee2acc7 Binary files /dev/null and b/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc differ diff --git a/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc b/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1302c831071e11829e8faf4d6cbdae24a6caea11 Binary files /dev/null and b/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc differ diff --git a/voice_confidence_score/main.py b/voice_confidence_score/main.py new file mode 
100644 index 0000000000000000000000000000000000000000..9e582c228c96d85bbc01d5b24e4125a54d72b46c --- /dev/null +++ b/voice_confidence_score/main.py @@ -0,0 +1,11 @@ +from .voice_confidence import calc_voice_confidence_score +import whisper + +model_size = "base" +whisper_model = whisper.load_model(model_size) + +audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" + +result = calc_voice_confidence_score(audio_file, whisper_model) + +print(f"Voice Confidence Score: {result:.2f}") \ No newline at end of file diff --git a/voice_confidence_score/voice_confidence.py b/voice_confidence_score/voice_confidence.py new file mode 100644 index 0000000000000000000000000000000000000000..1184206d76c61806d083f7cd8284064f691fd67b --- /dev/null +++ b/voice_confidence_score/voice_confidence.py @@ -0,0 +1,38 @@ +# voice confidence score = 0.4 * dominance + 0.3 * scs + 0.3 * fluency. + +import whisper +from fluency.compute_fluency import compute_fluency_score +from vcs.compute_vcs import analyze_voice_quality + + +def calc_fluency_score(audio_path, whisper_model): + + # Calculate fluency score + print(f"Analyzing fluency for {audio_path}...") + results = compute_fluency_score(audio_path, whisper_model) + fluency_score = results['fluency_score'] + + return fluency_score + +def calc_vcs(audio_path, whisper_model): + + + # Calculate voice clarity score + print(f"Analyzing voice clarity for {audio_path}...") + results = analyze_voice_quality(audio_path, whisper_model) + vcs = results['VCS'] + + return vcs + +dominance = 5.6 # dummy for now i add later + +def calc_voice_confidence_score(audio_path, model): + + fluency_score = calc_fluency_score(audio_path, model) + vcs = calc_vcs(audio_path, model) + + # Calculate voice confidence score + voice_confidence_score = 0.4 * dominance + 0.3 * vcs + 0.3 * fluency_score + + return voice_confidence_score + diff --git a/voice_confidence_score/voice_confidence_api.py b/voice_confidence_score/voice_confidence_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cd6b3ec671f803678b22e08e7212cf52b26d5c51 --- /dev/null +++ b/voice_confidence_score/voice_confidence_api.py @@ -0,0 +1,16 @@ +import whisper +from .voice_confidence import calc_voice_confidence_score + +def main(file_path: str, model_size: str = "base") -> dict: + try: + # Load the Whisper model + whisper_model = whisper.load_model(model_size) + + # Calculate the voice confidence score + result = calc_voice_confidence_score(file_path, whisper_model) + + # Return the result as a dictionary + return {"voice_confidence_score": round(result, 2)} + except Exception as e: + return {"error": str(e)} + diff --git a/vps/__init__.py b/vps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vps/__pycache__/__init__.cpython-312.pyc b/vps/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a9b97e335aeea6df957c3d11c3ccbdda90743e9 Binary files /dev/null and b/vps/__pycache__/__init__.cpython-312.pyc differ diff --git a/vps/__pycache__/compute_vps_score.cpython-312.pyc b/vps/__pycache__/compute_vps_score.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1533ad1a4897db15fbbcb581e7bedfeb58acab46 Binary files /dev/null and b/vps/__pycache__/compute_vps_score.cpython-312.pyc differ diff --git a/vps/__pycache__/filler_analyzer.cpython-312.pyc b/vps/__pycache__/filler_analyzer.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..1cd5b97993b072eaba33916fc9b9e5f2fbc4ef78 Binary files /dev/null and b/vps/__pycache__/filler_analyzer.cpython-312.pyc differ diff --git a/vps/__pycache__/main.cpython-312.pyc b/vps/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..334d491c8a9c4ab50a62037bfad6e49ddf9ed0b1 Binary files /dev/null and b/vps/__pycache__/main.cpython-312.pyc differ diff --git a/vps/__pycache__/vps.cpython-312.pyc b/vps/__pycache__/vps.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0bab3f8d9d7496bb7118f2cdd9af775b92e2807 Binary files /dev/null and b/vps/__pycache__/vps.cpython-312.pyc differ diff --git a/vps/__pycache__/vps_api.cpython-312.pyc b/vps/__pycache__/vps_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f185440db6b795894a63f7143bdcfc0ada12bb80 Binary files /dev/null and b/vps/__pycache__/vps_api.cpython-312.pyc differ diff --git a/vps/compute_vps_score.py b/vps/compute_vps_score.py new file mode 100644 index 0000000000000000000000000000000000000000..883bb25ca20ed7edea0f442250c172971a65e5dc --- /dev/null +++ b/vps/compute_vps_score.py @@ -0,0 +1,79 @@ +from .vps import calculate_vps # Your file where calc_srs, calculate_pas, calculate_rcs, calculate_vps live +import librosa +import numpy as np +import math +from .filler_analyzer import detect_fillers + +def compute_vps_score(file_path: str, whisper_model) -> dict: + """ + Compute VPS (Voice Pacing Score) and its components from a speech sample. + + Args: + file_path (str): Path to the audio file. + whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper) + + Returns: + dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores. 
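+
+    Raises:
+        ValueError: If the transcript or segments are empty, or if the audio duration is invalid.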
+ """ + # Transcribe + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + # Validate early + if not transcript or not segments: + raise ValueError("Empty transcript or segments from Whisper.") + + # Filler count + filler_count, _ = detect_fillers(transcript) + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + if duration <= 0: + raise ValueError("Audio duration invalid or zero.") + + # Pitch variation (in semitones) + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + pitch_variation = 0.0 + if voiced_f0.size > 0: + median_f0 = np.nanmedian(voiced_f0) + median_f0 = max(median_f0, 1e-6) + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + pitch_variation = float(np.nanstd(semitone_diffs)) + + # Pause analysis + long_pause_count = 0 + if segments: + for i in range(len(segments) - 1): + pause_dur = segments[i + 1]["start"] - segments[i]["end"] + if pause_dur > 1.0: + long_pause_count += 1 + # Beginning and end + if segments[0]["start"] > 1.0: + long_pause_count += 1 + if duration - segments[-1]["end"] > 1.0: + long_pause_count += 1 + + # WPM + word_count = len(transcript.split()) + words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0 + + # Calculate VPS and components + vps_result = calculate_vps( + transcript=transcript, + segments=segments, + filler_count=filler_count, + duration=duration, + wpm=words_per_min, + long_pause_count=long_pause_count, + pitch_variation=pitch_variation, + y=y, + sr=sr + ) + + # Include transcript optionally + vps_result["transcript"] = transcript + return vps_result diff --git a/vps/filler_analyzer.py b/vps/filler_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..090853dcbea679429376dbc44ace524e671baf61 --- /dev/null +++ b/vps/filler_analyzer.py @@ -0,0 +1,100 @@ +# Define filler words for English, Hindi, Tamil (in both Latin and native scripts) +# Mapping each variant to a common label (usually the Latin script for insight reporting) +FILLER_VARIANTS = { + # English fillers + "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er", + "umm": "um", "uhh": "uh", "mmm": "hmm", + "like": "like", "you know": "you know", "so": "so", "well": "well", + # Hindi fillers (Devanagari and transliteration) + "मतलब": "matlab", "matlab": "matlab", + "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain", + "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na", + "ऐसा है": "aisa hai", "aisa hai": "aisa hai", + "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan" + "अच्छा": "acha", "acha": "acha", + # Tamil fillers (Tamil script and transliteration) + "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na", + "அப்பரம்": "apparam", "apparam": "apparam", + "என்ன": "enna", "enna": "enna" +} + +def detect_fillers(transcript): + """ + Detects filler words in the transcript. 
+ + Args: + transcript: Full transcript text + + Returns: + tuple: (filler_count, filler_occurrences) + """ + transcript_lower = transcript.lower() + filler_count = 0 + # Track which specific fillers were used (for insight examples) + filler_occurrences = {} + + for variant, label in FILLER_VARIANTS.items(): + if variant in transcript_lower: + count = transcript_lower.count(variant) + if count > 0: + filler_count += count + # Accumulate count for the normalized label + filler_occurrences[label] = filler_occurrences.get(label, 0) + count + + return filler_count, filler_occurrences + +def analyze_filler_words(filler_count, filler_occurrences, duration): + """ + Analyzes filler word usage in speech. + + Args: + filler_count: Total count of filler words + filler_occurrences: Dictionary of specific filler words and their counts + duration: Duration of the audio in seconds + + Returns: + dict: Contains the filler words score and insight text + """ + # Extract top examples for insights + filler_examples = [] + if filler_occurrences: + # Sort by frequency + sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True) + for label, count in sorted_fillers[:2]: + filler_examples.append(label) + + # Compute fillers per minute as a gauge + filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0 + + if filler_count == 0: + filler_score = 10 + elif filler_per_min < 1: + filler_score = 9 + elif filler_per_min < 3: + filler_score = 8 + elif filler_per_min < 5: + filler_score = 6 + elif filler_per_min < 10: + filler_score = 4 + else: + filler_score = 2 + + filler_score = max(0, filler_score) + + # Generate insight text based on the score and examples + if filler_count == 0: + insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear." + elif filler_count <= 2: + example = filler_examples[0] if filler_examples else "um" + insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact." + elif filler_count <= 5: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words" + insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity." + else: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'" + insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty." 
+ + return { + "score": int(filler_score), + "insight": insight + } \ No newline at end of file diff --git a/vps/main.py b/vps/main.py new file mode 100644 index 0000000000000000000000000000000000000000..a1bd91d1e21c4f288c6c8ddd4a8231cb800d6a22 --- /dev/null +++ b/vps/main.py @@ -0,0 +1,35 @@ +import whisper +from .compute_vps_score import compute_vps_score # Ensure this path is correct + +def main(): + # 🔧 Set your input audio file path here + audio_path = r"D:\Intern\shankh\audio_samples\obama_short.wav" + + # 🔧 Choose Whisper model (tiny, base, small, medium, large) + model_size = "base" + + print(f"Loading Whisper model: {model_size}") + whisper_model = whisper.load_model(model_size) + + print(f"Analyzing audio: {audio_path}") + try: + vps_result = compute_vps_score(audio_path, whisper_model) + + print("\n--- Voice Pacing Score (VPS) ---") + print(f"VPS Score: {vps_result['VPS']:.2f}") + print(f" - SRS (Speech Rate Stability): {vps_result['SRS']:.2f}") + print(f" - PAS (Pause Appropriateness): {vps_result['PAS']:.2f}") + print(f" - NPP: {vps_result['NPP']:.2f}") + print(f" - AFW: {vps_result['AFW']:.2f}") + print(f" - RCS (Rhythm Consistency): {vps_result['RCS']:.2f}") + print(f" - STR: {vps_result['STR']:.2f}") + print(f" - STW: {vps_result['STW']:.2f}") + + print("\nTranscript:") + print(vps_result["transcript"]) + + except Exception as e: + print(f"[Error] {e}") + +if __name__ == "__main__": + main() diff --git a/vps/vps.py b/vps/vps.py new file mode 100644 index 0000000000000000000000000000000000000000..322f27181efb55ba0d0fa998ddc324703c33e7aa --- /dev/null +++ b/vps/vps.py @@ -0,0 +1,185 @@ +from typing import List, Dict +import librosa +import numpy as np +import spacy +import math +from .filler_analyzer import detect_fillers + +def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float: + """ + Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm. + """ + ideal_wpm = 150 + wpm_deviation = min(30, abs(wpm - ideal_wpm)) + wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) + + filler_penalty = min(filler_count / 10, 1.0) + pause_penalty = min(long_pause_count / 5, 1.0) + pitch_penalty = min(pitch_variation / 3.0, 1.0) + + stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100 + SRS = (0.45 * wpm_consistency) + (0.55 * stability) + return min(100, max(0, SRS)) + +def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]: + """ + Calculate the Pause Appropriateness Score (PAS) and its components. 
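+
+    PAS = (0.4 * NPP) + (0.6 * AFW), where NPP is the share of detected pauses that
+    fall at natural punctuation boundaries and AFW penalizes the filler-word rate.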
+ """ + if not transcript or not segments or duration <= 0: + raise ValueError("Transcript, segments, and duration must be valid") + + nlp = spacy.load("en_core_web_sm") + doc = nlp(transcript) + + words = transcript.split() + total_words = len(words) + if total_words == 0: + raise ValueError("No words found in transcript") + + filler_rate = filler_count / total_words if total_words > 0 else 0.0 + if filler_rate >= 0.10: + afw = 0.0 + elif filler_rate <= 0.0: + afw = 100.0 + else: + afw = 100.0 - (filler_rate * 1000) + afw = max(0.0, min(100.0, afw)) + + total_pauses = 0 + natural_pauses = 0 + segment_texts = [seg["text"].strip() for seg in segments] + segment_starts = [seg["start"] for seg in segments] + segment_ends = [seg["end"] for seg in segments] + + for i in range(len(segments) - 1): + pause_dur = segment_starts[i + 1] - segment_ends[i] + if pause_dur > 0.5: + total_pauses += 1 + if segment_texts[i] and segment_texts[i][-1] in ".!?,": + natural_pauses += 1 + + if segment_starts[0] > 0.5: + total_pauses += 1 + if duration - segment_ends[-1] > 0.5: + total_pauses += 1 + if segment_texts[-1] and segment_texts[-1][-1] in ".!?": + natural_pauses += 1 + + npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0 + pas = (0.4 * npp) + (0.6 * afw) + + return { + "NPP": npp, + "AFW": afw, + "PAS": pas + } + +def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]: + """ + Calculate the Rhythm Consistency Score (RCS) and its components. + """ + if y.size == 0 or sr <= 0 or duration <= 0 or not segments: + raise ValueError("Audio signal, sampling rate, duration, and segments must be valid") + + onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=256) + onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time', hop_length=256) + + if len(onsets) > 1: + iois = np.diff(onsets) + ioi_std = np.std(iois) + ioi_std = min(max(ioi_std, 0.1), 0.5) + str_score = 100.0 * (0.5 - ioi_std) / (0.5 - 0.1) + str_score = max(0.0, min(100.0, str_score)) + else: + str_score = 100.0 + + total_transitions = 0 + smooth_transitions = 0 + pause_threshold = 0.3 + + for i in range(len(segments) - 1): + gap = segments[i + 1]["start"] - segments[i]["end"] + total_transitions += 1 + if gap <= pause_threshold: + smooth_transitions += 1 + + for segment in segments: + words = segment["text"].strip().split() + if len(words) > 1: + smooth_transitions += len(words) - 1 + total_transitions += len(words) - 1 + + stw = 100.0 if total_transitions == 0 else (smooth_transitions / total_transitions) * 100.0 + rcs = (0.5 * str_score) + (0.5 * stw) + + return { + "STR": str_score, + "STW": stw, + "RCS": rcs + } + +def calculate_vps( + transcript: str, + segments: List[Dict], + filler_count: int, + duration: float, + wpm: float, + long_pause_count: int, + pitch_variation: float, + y: np.ndarray, + sr: int +) -> Dict[str, float]: + """ + Calculate the Voice Pacing Score (VPS) and its components: + - SRS: Speech Rate Stability Score + - PAS: Pause Appropriateness Score + - RCS: Rhythm Consistency Score + - VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS) + + Args: + transcript (str): Transcribed text. + segments (List[Dict]): Whisper model segments with 'start', 'end', 'text'. + filler_count (int): Number of filler words. + duration (float): Audio duration (seconds). + wpm (float): Words per minute. + long_pause_count (int): Number of long pauses (>1.0s). + pitch_variation (float): Pitch variation in semitones. + y (np.ndarray): Audio signal. 
+        sr (int): Sampling rate.
+
+    Returns:
+        Dict[str, float]: Scores for SRS, PAS, RCS, VPS, and intermediates.
+    """
+    # Validate inputs
+    if not transcript or not segments or duration <= 0 or y.size == 0 or sr <= 0:
+        raise ValueError("Invalid inputs")
+
+    # Calculate SRS
+    srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)
+
+    # Calculate PAS
+    pas_result = calculate_pas(transcript, segments, filler_count, duration)
+    pas = pas_result["PAS"]
+    npp = pas_result["NPP"]
+    afw = pas_result["AFW"]
+
+    # Calculate RCS
+    rcs_result = calculate_rcs(y, sr, segments, duration)
+    rcs = rcs_result["RCS"]
+    str_score = rcs_result["STR"]
+    stw = rcs_result["STW"]
+
+    # Calculate VPS
+    vps = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
+    vps = max(0.0, min(100.0, vps))
+
+    return {
+        "SRS": srs,
+        "PAS": pas,
+        "NPP": npp,
+        "AFW": afw,
+        "RCS": rcs,
+        "STR": str_score,
+        "STW": stw,
+        "VPS": vps
+    }
\ No newline at end of file
diff --git a/vps/vps_api.py b/vps/vps_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..42e10f2fa6ae093a470e619036f9e151bd67f351
--- /dev/null
+++ b/vps/vps_api.py
@@ -0,0 +1,25 @@
+import whisper
+from .compute_vps_score import compute_vps_score
+
+def main(file_path: str, model_size: str = "base") -> dict:
+    try:
+        # Load the Whisper model
+        whisper_model = whisper.load_model(model_size)
+
+        # Compute the Voice Pacing Score (VPS)
+        result = compute_vps_score(file_path, whisper_model)
+
+        # Return the result as a dictionary
+        return {
+            "VPS": result["VPS"]
+            # "SRS": result["SRS"],
+            # "PAS": result["PAS"],
+            # "NPP": result["NPP"],
+            # "AFW": result["AFW"],
+            # "RCS": result["RCS"],
+            # "STR": result["STR"],
+            # "STW": result["STW"]
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
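Taken together, the modules above expose one callable entry point per score. The sketch below (a minimal driver, assuming the repository root is on the Python path and using sample.wav as a placeholder file name) runs the VERS, VPS, voice-engagement and voice-confidence analyzers on a single recording. Note that the valence, arousal and dominance inputs are still hard-coded placeholders, so the engagement and confidence numbers are not yet meaningful.

from vers.vers_api import main as vers_main
from vps.vps_api import main as vps_main
from ves.ves import calc_voice_engagement_score
from voice_confidence_score.voice_confidence_api import main as confidence_main

audio_path = "sample.wav"  # placeholder; use any local speech recording

# The *_api helpers each load their own Whisper model internally; the engagement
# score is computed directly from the audio file.
print(vers_main(audio_path, model_size="base"))        # {"VERS Score": ...}
print(vps_main(audio_path, model_size="base"))         # {"VPS": ...}
print(calc_voice_engagement_score(audio_path))         # {"ves": ...}
print(confidence_main(audio_path, model_size="base"))  # {"voice_confidence_score": ...}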