diff --git a/README.md b/README.md index bf438ef020e1a7f1ef32ea6a30613f865fb271b0..a2425ffff9dcda4b48915a50d3b725f38220471b 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ --- -title: Fast Api -emoji: 💻 -colorFrom: blue -colorTo: purple +title: Voice Deploy +emoji: 🏢 +colorFrom: green +colorTo: gray sdk: docker pinned: false license: mit diff --git a/filler_count/__init__.py b/filler_count/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/filler_count/__pycache__/__init__.cpython-312.pyc b/filler_count/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25b7a301e74978c0e6506a75cf1f94e03b9c5a50 Binary files /dev/null and b/filler_count/__pycache__/__init__.cpython-312.pyc differ diff --git a/filler_count/__pycache__/filler_score.cpython-312.pyc b/filler_count/__pycache__/filler_score.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..342c09cc30d9ae79049c3790259eef572597c277 Binary files /dev/null and b/filler_count/__pycache__/filler_score.cpython-312.pyc differ diff --git a/filler_count/filler_score.py b/filler_count/filler_score.py new file mode 100644 index 0000000000000000000000000000000000000000..ae208fcfedb5b453d58f877b0341c44dbd97f6dc --- /dev/null +++ b/filler_count/filler_score.py @@ -0,0 +1,24 @@ +import re +import whisper + +def analyze_fillers(file_path: str, model_size: str = "base") -> dict: + try: + FILLER_WORDS = ["um", "uh", "hmm", "ah", "er", "eh", "like", "you know", "well"] + + model = whisper.load_model(model_size) + result = model.transcribe(file_path, word_timestamps=False, fp16=False) + transcript = result["text"] + + pattern = r"\b(" + "|".join(FILLER_WORDS) + r")\b" + matches = re.findall(pattern, transcript.lower()) + + filler_counts = {filler: matches.count(filler) for filler in FILLER_WORDS} + total_fillers = sum(filler_counts.values()) + + return { + # "transcript": transcript, + "filler_counts": {k: v for k, v in filler_counts.items() if v > 0}, + "total_fillers": total_fillers + } + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") \ No newline at end of file diff --git a/fluency/__init__.py b/fluency/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fb4f112a7786b5d2f56ec5804e2ec32443669823 --- /dev/null +++ b/fluency/__init__.py @@ -0,0 +1,13 @@ +# fluency/__init__.py +from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight +from .filler_analyzer import detect_fillers +from .compute_fluency import compute_fluency_score + +__all__ = [ + 'calc_srs', + 'calculate_pas', + 'calculate_fluency', + 'get_fluency_insight', + 'detect_fillers', + 'compute_fluency_score' +] \ No newline at end of file diff --git a/fluency/__pycache__/__init__.cpython-312.pyc b/fluency/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e6e73030145775aeaad1a0a0f1d524fc00f8d56 Binary files /dev/null and b/fluency/__pycache__/__init__.cpython-312.pyc differ diff --git a/fluency/__pycache__/compute_fluency.cpython-312.pyc b/fluency/__pycache__/compute_fluency.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38ced3a7fca80d7c83c6a4193da1d9cc8a95aae4 Binary files /dev/null and b/fluency/__pycache__/compute_fluency.cpython-312.pyc differ diff --git a/fluency/__pycache__/filler_analyzer.cpython-312.pyc 
b/fluency/__pycache__/filler_analyzer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..916bd6bbf6ef2a033b546e511061165d10eb1ba6 Binary files /dev/null and b/fluency/__pycache__/filler_analyzer.cpython-312.pyc differ diff --git a/fluency/__pycache__/fluency.cpython-312.pyc b/fluency/__pycache__/fluency.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6e47449770f2e2428c6e0fd46ddb7ef3ee6cb385 Binary files /dev/null and b/fluency/__pycache__/fluency.cpython-312.pyc differ diff --git a/fluency/__pycache__/fluency_api.cpython-312.pyc b/fluency/__pycache__/fluency_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29d1a2e8c4c9c227d6c725029f9f6f45ef7658b7 Binary files /dev/null and b/fluency/__pycache__/fluency_api.cpython-312.pyc differ diff --git a/fluency/__pycache__/main.cpython-312.pyc b/fluency/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..805180b9205c2030775471214c226de9ca5c8a4b Binary files /dev/null and b/fluency/__pycache__/main.cpython-312.pyc differ diff --git a/fluency/compute_fluency.py b/fluency/compute_fluency.py new file mode 100644 index 0000000000000000000000000000000000000000..52fe6f50986adec73909bfe58e2e0758ed76ffcd --- /dev/null +++ b/fluency/compute_fluency.py @@ -0,0 +1,106 @@ +""" +Compute fluency score from audio file using SRS and PAS calculations +""" + +import librosa +import numpy as np +from typing import Dict, Any, Union +from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight +from .filler_analyzer import detect_fillers + +def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]: + """ + Compute fluency score and its components from a speech sample. + + Args: + file_path (str): Path to the audio file. + whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper) + + Returns: + dict: A dictionary containing fluency score, SRS, PAS, and component scores. 
+ """ + # Transcribe audio + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + # Validate early + if not transcript or not segments: + raise ValueError("Empty transcript or segments from Whisper.") + + # Detect filler words + filler_count, _ = detect_fillers(transcript) + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + if duration <= 0: + raise ValueError("Audio duration invalid or zero.") + + # Calculate pitch variation (in semitones) + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + pitch_variation = 0.0 + if voiced_f0.size > 0: + median_f0 = np.nanmedian(voiced_f0) + median_f0 = max(median_f0, 1e-6) + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + pitch_variation = float(np.nanstd(semitone_diffs)) + + # Analyze pauses + long_pause_count = 0 + if segments: + for i in range(len(segments) - 1): + pause_dur = segments[i + 1]["start"] - segments[i]["end"] + if pause_dur > 1.0: + long_pause_count += 1 + # Check beginning and end pauses + if segments[0]["start"] > 1.0: + long_pause_count += 1 + if duration - segments[-1]["end"] > 1.0: + long_pause_count += 1 + + # Calculate WPM + word_count = len(transcript.split()) + words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0 + + # Calculate SRS - Speech Rate Stability + srs_score = calc_srs( + wpm=words_per_min, + filler_count=filler_count, + long_pause_count=long_pause_count, + pitch_variation=pitch_variation + ) + + # Calculate PAS - Pause Appropriateness Score + pas_result = calculate_pas( + transcript=transcript, + segments=segments, + filler_count=filler_count, + duration=duration + ) + pas_score = pas_result["PAS"] + + # Calculate final fluency score + fluency_result = calculate_fluency(srs=srs_score, pas=pas_score) + fluency_score = fluency_result["score"] + insight = get_fluency_insight(fluency_score) + + # Build and return comprehensive result + return { + "fluency_score": fluency_score, + "insight": insight, + "SRS": srs_score, + "PAS": pas_score, + "components": { + "wpm": words_per_min, + "filler_count": filler_count, + "long_pause_count": long_pause_count, + "pitch_variation": pitch_variation, + "word_count": word_count, + "duration": duration, + "pas_components": pas_result + }, + "transcript": transcript + } \ No newline at end of file diff --git a/fluency/filler_analyzer.py b/fluency/filler_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..090853dcbea679429376dbc44ace524e671baf61 --- /dev/null +++ b/fluency/filler_analyzer.py @@ -0,0 +1,100 @@ +# Define filler words for English, Hindi, Tamil (in both Latin and native scripts) +# Mapping each variant to a common label (usually the Latin script for insight reporting) +FILLER_VARIANTS = { + # English fillers + "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er", + "umm": "um", "uhh": "uh", "mmm": "hmm", + "like": "like", "you know": "you know", "so": "so", "well": "well", + # Hindi fillers (Devanagari and transliteration) + "मतलब": "matlab", "matlab": "matlab", + "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain", + "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na", + "ऐसा है": "aisa hai", "aisa hai": "aisa hai", + "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan" + "अच्छा": "acha", "acha": "acha", + # Tamil fillers 
(Tamil script and transliteration) + "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na", + "அப்பரம்": "apparam", "apparam": "apparam", + "என்ன": "enna", "enna": "enna" +} + +def detect_fillers(transcript): + """ + Detects filler words in the transcript. + + Args: + transcript: Full transcript text + + Returns: + tuple: (filler_count, filler_occurrences) + """ + transcript_lower = transcript.lower() + filler_count = 0 + # Track which specific fillers were used (for insight examples) + filler_occurrences = {} + + for variant, label in FILLER_VARIANTS.items(): + if variant in transcript_lower: + count = transcript_lower.count(variant) + if count > 0: + filler_count += count + # Accumulate count for the normalized label + filler_occurrences[label] = filler_occurrences.get(label, 0) + count + + return filler_count, filler_occurrences + +def analyze_filler_words(filler_count, filler_occurrences, duration): + """ + Analyzes filler word usage in speech. + + Args: + filler_count: Total count of filler words + filler_occurrences: Dictionary of specific filler words and their counts + duration: Duration of the audio in seconds + + Returns: + dict: Contains the filler words score and insight text + """ + # Extract top examples for insights + filler_examples = [] + if filler_occurrences: + # Sort by frequency + sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True) + for label, count in sorted_fillers[:2]: + filler_examples.append(label) + + # Compute fillers per minute as a gauge + filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0 + + if filler_count == 0: + filler_score = 10 + elif filler_per_min < 1: + filler_score = 9 + elif filler_per_min < 3: + filler_score = 8 + elif filler_per_min < 5: + filler_score = 6 + elif filler_per_min < 10: + filler_score = 4 + else: + filler_score = 2 + + filler_score = max(0, filler_score) + + # Generate insight text based on the score and examples + if filler_count == 0: + insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear." + elif filler_count <= 2: + example = filler_examples[0] if filler_examples else "um" + insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact." + elif filler_count <= 5: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words" + insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity." + else: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'" + insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty." + + return { + "score": int(filler_score), + "insight": insight + } \ No newline at end of file diff --git a/fluency/fluency.py b/fluency/fluency.py new file mode 100644 index 0000000000000000000000000000000000000000..c4677068bee6dadf2cc9b7904cd40f4d51489178 --- /dev/null +++ b/fluency/fluency.py @@ -0,0 +1,149 @@ + + +import spacy +from typing import List, Dict + +def calc_srs(wpm, filler_count, long_pause_count, pitch_variation): + """ + Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm. + + Args: + wpm (float): Words per minute + filler_count (int): Number of filler words ("um", "uh", etc.) 
+ long_pause_count (int): Number of pauses longer than 1 second + pitch_variation (float): Standard deviation of pitch in semitones + + Returns: + float: SRS score between 0-100 + + Requires: + - Words per Minute Consistency: Regularity in speech speed. + - Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes. + """ + ideal_wpm = 150 + wpm_deviation = min(30, abs(wpm - ideal_wpm)) # Cap at 30 WPM deviation + wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) # 100-50 for max deviation + + # Sudden Speech Shift Penalty + filler_penalty = min(filler_count / 10, 1.0) + pause_penalty = min(long_pause_count / 5, 1.0) + pitch_penalty = min(pitch_variation / 3.0, 1.0) # High variation → unstable + + # Combine into absence of sudden shifts + stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100 + + # Final SRS Score + SRS = (0.45 * wpm_consistency) + (0.55 * stability) + return min(100, max(0, SRS)) + + +def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]: + """ + Calculate the Pause Appropriateness Score (PAS) and its components. + + Args: + transcript (str): Full transcript text + segments (List[Dict]): List of transcript segments with start/end times + filler_count (int): Number of filler words detected + duration (float): Total duration of audio in seconds + + Returns: + Dict[str, float]: Dictionary with NPP, AFW, and PAS scores + """ + if not transcript or not segments or duration <= 0: + raise ValueError("Transcript, segments, and duration must be valid") + + nlp = spacy.load("en_core_web_sm") + doc = nlp(transcript) + + words = transcript.split() + total_words = len(words) + if total_words == 0: + raise ValueError("No words found in transcript") + + # Calculate Avoidance of Filler Words (AFW) + filler_rate = filler_count / total_words if total_words > 0 else 0.0 + if filler_rate >= 0.10: + afw = 0.0 + elif filler_rate <= 0.0: + afw = 100.0 + else: + afw = 100.0 - (filler_rate * 1000) + afw = max(0.0, min(100.0, afw)) + + # Calculate Natural Pause Placement (NPP) + total_pauses = 0 + natural_pauses = 0 + segment_texts = [seg["text"].strip() for seg in segments] + segment_starts = [seg["start"] for seg in segments] + segment_ends = [seg["end"] for seg in segments] + + for i in range(len(segments) - 1): + pause_dur = segment_starts[i + 1] - segment_ends[i] + if pause_dur > 0.5: + total_pauses += 1 + if segment_texts[i] and segment_texts[i][-1] in ".!?,": + natural_pauses += 1 + + # Check initial and final pauses + if segment_starts[0] > 0.5: + total_pauses += 1 + if duration - segment_ends[-1] > 0.5: + total_pauses += 1 + if segment_texts[-1] and segment_texts[-1][-1] in ".!?": + natural_pauses += 1 + + npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0 + + # Calculate final PAS + pas = (0.4 * npp) + (0.6 * afw) + + return { + "NPP": npp, + "AFW": afw, + "PAS": pas + } + + +def calculate_fluency(srs: float, pas: float) -> Dict[str, float]: + """ + Calculate fluency score based on Speech Rate Stability and Pause Appropriateness Score. 
+ + Args: + srs (float): Speech Rate Stability score (0-100) + pas (float): Pause Appropriateness Score (0-100) + + Returns: + Dict[str, float]: Dictionary with fluency score (0-100) and component contributions + """ + # Equal weighting of SRS and PAS for fluency + fluency_score = (0.5 * srs) + (0.5 * pas) + + + return { + "score": fluency_score, + "SRS_contribution": 0.5 * srs, + "PAS_contribution": 0.5 * pas + } + + +def get_fluency_insight(fluency_score: float) -> str: + """ + Generate insight text based on the fluency score. + + Args: + fluency_score (float): The calculated fluency score (0-100) + + Returns: + str: Insight text explaining the score + """ + if fluency_score >= 85: + return "Excellent fluency with very consistent pacing and natural pauses. Speech flows effortlessly." + elif fluency_score >= 70: + return "Good fluency with generally stable speech rate and appropriate pauses. Some minor inconsistencies." + elif fluency_score >= 50: + return "Moderate fluency with occasional disruptions in speech flow. Consider working on pace stability and pause placement." + elif fluency_score >= 30: + return "Below average fluency with noticeable disruptions. Focus on reducing filler words and maintaining consistent pace." + else: + return "Speech fluency needs significant improvement. Work on maintaining consistent pace, reducing long pauses, and eliminating filler words." \ No newline at end of file diff --git a/fluency/fluency_api.py b/fluency/fluency_api.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4a16314a616b8af2efa6761feabe12a63005c5 --- /dev/null +++ b/fluency/fluency_api.py @@ -0,0 +1,22 @@ +import whisper +from .compute_fluency import compute_fluency_score + +def main(file_path: str, model_size: str = "base") -> dict: + try: + + whisper_model = whisper.load_model(model_size) + + results = compute_fluency_score(file_path, whisper_model) + + # Structure response + response = { + "fluency_score": round(results['fluency_score'], 2) + # "insight": results["insight"], + # "SRS": round(results["SRS"], 2), + # "PAS": round(results["PAS"], 2), + # "transcript": results["transcript"] + } + return response + + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") diff --git a/fluency/main.py b/fluency/main.py new file mode 100644 index 0000000000000000000000000000000000000000..99a220d8c8214f19465ce1b13778b6a2ed067be4 --- /dev/null +++ b/fluency/main.py @@ -0,0 +1,49 @@ +import json +import whisper +from .compute_fluency import compute_fluency_score + +def main(): + """ + Main function to run fluency analysis on audio files + """ + # Fixed parameters - modify these values directly in the code + audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" # Path to your audio file + model_size = "base" # Whisper model size (tiny, base, small, medium, large) + verbose = True # Whether to print detailed results + + try: + # Load whisper model + print(f"Loading Whisper model ({model_size})...") + whisper_model = whisper.load_model(model_size) + + # Calculate fluency score + print(f"Analyzing fluency for {audio_file}...") + results = compute_fluency_score(audio_file, whisper_model) + + # Print summary results + print("\nFluency Analysis Results:") + print(f"- Fluency Score: {results['fluency_score']:.2f}/100") + print(f"- Insight: {results['insight']}") + print(f"- Speech Rate Stability (SRS): {results['SRS']:.2f}/100") + print(f"- Pause Appropriateness (PAS): {results['PAS']:.2f}/100") + + # Print verbose results if enabled + if verbose: 
+ print("\nDetailed Metrics:") + print(f"- Words per minute: {results['components']['wpm']:.1f}") + print(f"- Filler word count: {results['components']['filler_count']}") + print(f"- Long pauses: {results['components']['long_pause_count']}") + print(f"- Pitch variation: {results['components']['pitch_variation']:.2f} semitones") + print(f"- Natural Pause Placement: {results['components']['pas_components']['NPP']:.2f}/100") + print(f"- Avoidance of Filler Words: {results['components']['pas_components']['AFW']:.2f}/100") + + # Print first 100 characters of transcript + transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript'] + print(f"\nTranscript preview: {transcript_preview}") + + except Exception as e: + print(f"Error during analysis: {str(e)}") + return 1 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 97dc7cd8c1fd2f07d8ec79a1117664c5ebaf2842..ecac929cc15b933ccb3cf49975a35ebd0695ce5d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,21 @@ + fastapi uvicorn +python-multipart + + +librosa +soundfile +pyworld +scipy + + +openai-whisper==20240930 +spacy==3.8.5 +en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl + + + +numpy +tqdm +requests \ No newline at end of file diff --git a/tone_modulation/__init__.py b/tone_modulation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tone_modulation/__pycache__/__init__.cpython-312.pyc b/tone_modulation/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..595162a2e38a9ee3e8ef1739f3f99258be7bc64e Binary files /dev/null and b/tone_modulation/__pycache__/__init__.cpython-312.pyc differ diff --git a/tone_modulation/__pycache__/sds.cpython-312.pyc b/tone_modulation/__pycache__/sds.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c66263bb21c8cddfe6a11b80b3126b9a302ed819 Binary files /dev/null and b/tone_modulation/__pycache__/sds.cpython-312.pyc differ diff --git a/tone_modulation/__pycache__/tone_api.cpython-312.pyc b/tone_modulation/__pycache__/tone_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57e50da06ffcd75eb1333f71852b4e1858f1ff7f Binary files /dev/null and b/tone_modulation/__pycache__/tone_api.cpython-312.pyc differ diff --git a/tone_modulation/sds.py b/tone_modulation/sds.py new file mode 100644 index 0000000000000000000000000000000000000000..a4166ed8b27f4f38b14a012ddbbd1c6e25c4fbff --- /dev/null +++ b/tone_modulation/sds.py @@ -0,0 +1,385 @@ + +import scipy.signal +import numpy as np +import librosa +import pyworld as pw + +# def compute_pitch_variation(file_path): +# # Step 1: Load audio +# y, sr = librosa.load(file_path, sr=None) +# y = y.astype(np.float64) # pyworld expects float64 + +# # Step 2: Extract pitch (F0) +# _f0, t = pw.dio(y, sr) # Fast initial pitch estimation +# f0 = pw.stonemask(y, _f0, t, sr) # Refinement step + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[f0 > 0] + +# # Handle empty case +# if voiced_f0.size == 0: +# return { +# "pitch_mean": 0.0, +# "pitch_std": 0.0, +# "pitch_range": 0.0, +# "semitone_std": 0.0, +# "pitch_variation_score": 0.0 +# } + +# # Step 4: Basic statistics +# pitch_mean = np.mean(voiced_f0) +# pitch_std = np.std(voiced_f0) +# pitch_range =
np.max(voiced_f0) - np.min(voiced_f0) + +# print(pitch_mean) +# print(f'voiced_f0: {voiced_f0}') +# # Step 5: Compute semitone-based variation (better for human perception) +# median_f0 = np.median(voiced_f0) +# if median_f0 <= 0: +# median_f0 = 1e-6 # Avoid division by zero + +# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) +# semitone_std = np.std(semitone_diffs) +# print(semitone_std) + +# # Step 6: Scale semitone_std to a 0–100 score (tunable) +# # For example: semitone_std of 0 → 0 score, ≥6 semitones → 100 score +# pitch_variation_score = np.clip((semitone_std / 6.0) * 100, 0, 100) + +# return { +# "pitch_mean": pitch_mean, +# "pitch_std": pitch_std, +# "pitch_range": pitch_range, +# "semitone_std": semitone_std, +# "pitch_variation_score": pitch_variation_score +# } +# def compute_intonation_range(file_path): +# # Step 1: Load and prepare audio +# y, sr = librosa.load(file_path, sr=None) +# y = y.astype(np.float64) + +# # Step 2: Extract F0 +# _f0, t = pw.dio(y, sr) +# f0 = pw.stonemask(y, _f0, t, sr) + + + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[f0 > 0] +# if voiced_f0.size == 0: +# return 0.0 + +# voiced_f0 = voiced_f0[(voiced_f0 > np.percentile(voiced_f0, 5)) & +# (voiced_f0 < np.percentile(voiced_f0, 95))] + +# # Step 4: Compute intonation range (in semitones) +# f0_min = np.min(voiced_f0) +# f0_max = np.max(voiced_f0) +# if f0_min <= 0: +# f0_min = 1e-6 # to avoid log error +# intonation_range = 12 * np.log2(f0_max / f0_min) + +# # range into scores: + +# max_range = 12.0 +# normalized = min(intonation_range, max_range) / max_range +# score = normalized * 100 +# return round(score, 2), intonation_range + + + +# def compute_pitch_variation(file_path): +# # Step 1: Load audio +# y, sr = librosa.load(file_path, sr=None) + +# # Step 2: Extract pitch using librosa.pyin (YIN-based) +# f0, voiced_flags, voiced_probs = librosa.pyin( +# y, +# sr=sr, +# fmin=80, +# fmax=400, +# frame_length=1105, +# hop_length=256, +# fill_na=np.nan +# ) + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[~np.isnan(f0)] + + +# voiced_f0 = voiced_f0[ +# (voiced_f0 > np.percentile(voiced_f0, 5)) & +# (voiced_f0 < np.percentile(voiced_f0, 95)) +# ] + +# # Handle empty case +# if voiced_f0.size == 0: +# return { +# "pitch_mean": 0.0, +# "pitch_std": 0.0, +# "pitch_range": 0.0, +# "semitone_std": 0.0, +# "pitch_variation_score": 0.0 +# } + +# # Step 4: Basic statistics +# pitch_mean = float(np.mean(voiced_f0)) +# pitch_std = float(np.std(voiced_f0)) +# pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0)) + + +# # Step 5: Compute semitone-based variation +# median_f0 = np.median(voiced_f0) +# if median_f0 <= 0: +# median_f0 = 1e-6 + +# semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) +# semitone_std = float(np.std(semitone_diffs)) + + +# # Step 6: Scale to 0–100 score +# pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100)) +# return { +# "pitch_mean": pitch_mean, +# "pitch_std": pitch_std, +# "pitch_range": pitch_range, +# "semitone_std": semitone_std, +# "pitch_variation_score": pitch_variation_score +# } + +# def compute_intonation_range(file_path): +# # Step 1: Load and prepare audio +# y, sr = librosa.load(file_path, sr=None) + +# # Step 2: Extract F0 using librosa.pyin +# f0, voiced_flags, voiced_probs = librosa.pyin( +# y, +# sr=sr, +# fmin=80, +# fmax=400, +# frame_length=1105, # ensures two periods of fmin fit +# hop_length=256, +# fill_na=np.nan +# ) + +# # Step 3: Filter voiced frames +# voiced_f0 = f0[~np.isnan(f0)] +# if voiced_f0.size == 0: +# 
return 0.0, 0.0 + +# # Optional: remove outliers (5th to 95th percentile) +# voiced_f0 = voiced_f0[ +# (voiced_f0 > np.percentile(voiced_f0, 5)) & +# (voiced_f0 < np.percentile(voiced_f0, 95)) +# ] + +# # Step 4: Compute intonation range in semitones +# f0_min = np.min(voiced_f0) +# f0_max = np.max(voiced_f0) +# if f0_min <= 0: +# f0_min = 1e-6 + +# intonation_range = 12 * np.log2(f0_max / f0_min) + +# # Step 5: Normalize and convert to score out of 100 +# max_range = 12.0 # ~1 octave +# normalized = min(intonation_range, max_range) / max_range +# score = normalized * 100 + +# return round(score, 2), float(intonation_range) + + + +# def compute_speech_rhythm_variability(file_path): +# """ +# Computes the speech rhythm variability score from an audio file. +# The method estimates tempo consistency across time using onset intervals. + +# Returns: +# score (float): Normalized rhythm variability score out of 100. +# raw_std (float): Raw standard deviation of inter-onset intervals. +# """ +# # Step 1: Load audio +# y, sr = librosa.load(file_path, sr=None) + +# # Step 2: Onset detection +# onset_env = librosa.onset.onset_strength(y=y, sr=sr) +# onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time') + +# if len(onsets) < 2: +# return 0.0, 0.0 # Not enough onsets to compute rhythm + +# # Step 3: Compute inter-onset intervals (IOIs) as rhythm proxy +# iois = np.diff(onsets) + +# # Optional: Remove outliers (5th–95th percentile) +# ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))] +# if len(ioi_clean) < 2: +# return 0.0, 0.0 + +# # Step 4: Compute variability — standard deviation of IOIs +# raw_std = np.std(ioi_clean) + +# # Step 5: Normalize raw_std to 0–100 score +# # Lower std = more consistent rhythm → higher score +# min_std = 0.05 # near-perfect rhythm (tight pacing) +# max_std = 0.6 # highly irregular rhythm + +# # Clamp and reverse-score +# clamped_std = np.clip(raw_std, min_std, max_std) +# normalized = 1 - (clamped_std - min_std) / (max_std - min_std) +# score = normalized * 100 + +# return round(score, 2), round(float(raw_std), 4) + + +# def calc_sds(file_path): + +# # sds = 0.35 * pitch_variation + 0.35 * intonation_range + 0.3 * speech_rhythm_variability + +# pitch_variation = compute_pitch_variation(file_path) +# intonation_range = compute_intonation_range(file_path) +# speech_rhythm_variability = compute_speech_rhythm_variability(file_path) +# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") +# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") +# # print(f"Speech Rhythm Variability Score: {speech_rhythm_variability}") + +# sds = 0.35 * pitch_variation['pitch_variation_score'] + 0.35 * intonation_range[0] + 0.3 * speech_rhythm_variability[0] +# return round(sds, 2) + +# path = r'D:\Intern\shankh\audio_samples\anga.wav' + +# result = calc_sds(path) +# print(f"SDS: {result}") + +import numpy as np +import librosa +import pyworld + +def compute_pitch_variation(file_path): + # Step 1: Load audio + y, sr = librosa.load(file_path, sr=None) + + # Step 2: Extract pitch using pyworld + _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr) + f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr) + + # Step 3: Filter voiced frames + voiced_f0 = f0[f0 > 0] + + # Remove outliers (5th to 95th percentile) + voiced_f0 = voiced_f0[ + (voiced_f0 > np.percentile(voiced_f0, 5)) & + (voiced_f0 < np.percentile(voiced_f0, 95)) + ] + + if 
voiced_f0.size == 0: + return { + "pitch_mean": 0.0, + "pitch_std": 0.0, + "pitch_range": 0.0, + "semitone_std": 0.0, + "pitch_variation_score": 0.0 + } + + # Step 4: Basic statistics + pitch_mean = float(np.mean(voiced_f0)) + pitch_std = float(np.std(voiced_f0)) + pitch_range = float(np.max(voiced_f0) - np.min(voiced_f0)) + + # Step 5: Semitone-based variation + median_f0 = np.median(voiced_f0) + if median_f0 <= 0: + median_f0 = 1e-6 + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + semitone_std = float(np.std(semitone_diffs)) + + # Step 6: Scaled variation score + pitch_variation_score = float(np.clip((semitone_std / 6.0) * 100, 0, 100)) + + return { + "pitch_mean": pitch_mean, + "pitch_std": pitch_std, + "pitch_range": pitch_range, + "semitone_std": semitone_std, + "pitch_variation_score": pitch_variation_score + } + + +def compute_intonation_range(file_path): + # Step 1: Load audio + y, sr = librosa.load(file_path, sr=None) + + # Step 2: Extract pitch using pyworld + _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr) + f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr) + + # Step 3: Filter voiced frames + voiced_f0 = f0[f0 > 0] + if voiced_f0.size == 0: + return 0.0, 0.0 + + # Remove outliers + voiced_f0 = voiced_f0[ + (voiced_f0 > np.percentile(voiced_f0, 5)) & + (voiced_f0 < np.percentile(voiced_f0, 95)) + ] + if voiced_f0.size == 0: + return 0.0, 0.0 + + # Step 4: Compute intonation range + f0_min = np.min(voiced_f0) + f0_max = np.max(voiced_f0) + if f0_min <= 0: + f0_min = 1e-6 + intonation_range = 12 * np.log2(f0_max / f0_min) + + # Step 5: Normalize + max_range = 12.0 + normalized = min(intonation_range, max_range) / max_range + score = normalized * 100 + + return round(score, 2), float(intonation_range) + + +def compute_speech_rhythm_variability(file_path): + """ + Computes the speech rhythm variability score from an audio file. + The method estimates tempo consistency across time using onset intervals. 
+ """ + y, sr = librosa.load(file_path, sr=None) + + # Step 2: Onset detection + onset_env = librosa.onset.onset_strength(y=y, sr=sr) + onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time') + + if len(onsets) < 2: + return 0.0, 0.0 + + iois = np.diff(onsets) + + ioi_clean = iois[(iois > np.percentile(iois, 5)) & (iois < np.percentile(iois, 95))] + if len(ioi_clean) < 2: + return 0.0, 0.0 + + raw_std = np.std(ioi_clean) + + min_std = 0.05 + max_std = 0.6 + clamped_std = np.clip(raw_std, min_std, max_std) + normalized = 1 - (clamped_std - min_std) / (max_std - min_std) + score = normalized * 100 + + return round(score, 2), round(float(raw_std), 4) + + +def calc_sds(file_path): + pitch_variation = compute_pitch_variation(file_path) + intonation_range = compute_intonation_range(file_path) + speech_rhythm_variability = compute_speech_rhythm_variability(file_path) + + sds = 0.35 * pitch_variation['pitch_variation_score'] + \ + 0.35 * intonation_range[0] + \ + 0.3 * speech_rhythm_variability[0] + + return round(sds, 2) diff --git a/tone_modulation/tone_api.py b/tone_modulation/tone_api.py new file mode 100644 index 0000000000000000000000000000000000000000..5a59ec029454e5eef1c768ab55415c04edf564de --- /dev/null +++ b/tone_modulation/tone_api.py @@ -0,0 +1,23 @@ + +from .sds import calc_sds + +import logging +logger = logging.getLogger(__name__) + +def main(file_path: str) -> dict: + logger.info(f"Starting tone analysis for: {file_path}") + try: + + + results = calc_sds(file_path) + + # Structure response + response = { + "speech_dynamism_score" : round(results, 2), + } + logger.info("Tone analysis complete") + return response + + except Exception as e: + logger.error(f"Tone analysis failed internally: {e}", exc_info=True) + raise RuntimeError(f"Error during analysis: {str(e)}") \ No newline at end of file diff --git a/transcribe.py b/transcribe.py new file mode 100644 index 0000000000000000000000000000000000000000..211a092dd0ad757691de19fbcbf56e3041d7322f --- /dev/null +++ b/transcribe.py @@ -0,0 +1,24 @@ +# using whisper to transcribe audio files + +import whisper +import os + +def transcribe_audio(file_path, model_size="base"): + """ + Transcribe audio file using Whisper model. + + Args: + file_path (str): Path to the audio file. + model_size (str): Size of the Whisper model to use. Options are "tiny", "base", "small", "medium", "large". + + Returns: + str: Transcription of the audio file. 
+ """ + # Load the Whisper model + model = whisper.load_model(model_size) + + # Transcribe the audio file + result = model.transcribe(file_path, fp16=False) + + # Return the transcription + return result["text"] \ No newline at end of file diff --git a/vcs/__init__.py b/vcs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vcs/__pycache__/__init__.cpython-312.pyc b/vcs/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65a543bb27d68627d356bbd1bb46096f0a8e3005 Binary files /dev/null and b/vcs/__pycache__/__init__.cpython-312.pyc differ diff --git a/vcs/__pycache__/compute_vcs.cpython-312.pyc b/vcs/__pycache__/compute_vcs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a03707a3f1a37d2028b59c27a84460cf889fd2e5 Binary files /dev/null and b/vcs/__pycache__/compute_vcs.cpython-312.pyc differ diff --git a/vcs/__pycache__/main.cpython-312.pyc b/vcs/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46325ebdf675ad8b1c4367ea66016deb4ba0146b Binary files /dev/null and b/vcs/__pycache__/main.cpython-312.pyc differ diff --git a/vcs/__pycache__/vcs.cpython-312.pyc b/vcs/__pycache__/vcs.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b612456c4c869dbc07fcda8547955b4d347e168e Binary files /dev/null and b/vcs/__pycache__/vcs.cpython-312.pyc differ diff --git a/vcs/__pycache__/vcs_api.cpython-312.pyc b/vcs/__pycache__/vcs_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd619aed9a3a9305bc7862e728e95678d804091c Binary files /dev/null and b/vcs/__pycache__/vcs_api.cpython-312.pyc differ diff --git a/vcs/compute_vcs.py b/vcs/compute_vcs.py new file mode 100644 index 0000000000000000000000000000000000000000..33affc9fac48f9b813bec1da140046c6c1bae321 --- /dev/null +++ b/vcs/compute_vcs.py @@ -0,0 +1,117 @@ +""" +Compute Voice Clarity Score from audio file +""" + +import librosa +import numpy as np +from typing import Dict, Any +from .vcs import calculate_voice_clarity_score, get_clarity_insight + +def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]: + """ + Compute Voice Clarity Score and its components from a speech sample. + + Args: + file_path (str): Path to the audio file. + whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper) + + Returns: + dict: A dictionary containing Voice Clarity Score and component scores. 
+ """ + # Transcribe audio + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + # Validate early + if not transcript or not segments: + raise ValueError("Empty transcript or segments from Whisper.") + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + if duration <= 0: + raise ValueError("Audio duration invalid or zero.") + + # Calculate Voice Clarity Score + clarity_result = calculate_voice_clarity_score(y, sr, segments) + + # Add transcript to results + clarity_result["transcript"] = transcript + + # Add word count and duration info for reference + word_count = len(transcript.split()) + clarity_result["components"]["word_count"] = word_count + clarity_result["components"]["duration"] = duration + + return clarity_result + +def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]: + """ + Comprehensive voice quality analysis including clarity. + + Args: + file_path (str): Path to the audio file + whisper_model: Transcription model + + Returns: + Dict[str, Any]: Complete voice quality analysis + """ + # Get Voice Clarity Score + clarity_results = compute_voice_clarity_score(file_path, whisper_model) + vcs = clarity_results["VCS"] + + # Load audio for additional analysis + y, sr = librosa.load(file_path, sr=None) + + # Calculate additional voice quality metrics + + # Voice stability - based on pitch (F0) stability + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + + pitch_stability = 0.0 + if voiced_f0.size > 0: + # Calculate coefficient of variation (lower is more stable) + cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf') + # Convert to score (0-100) + pitch_stability = max(0, min(100, 100 - (cv * 100))) + + # Voice resonance - based on spectral bandwidth + bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)) + # Normalize (ideal range is around 1500-2500 Hz for speech) + if bandwidth < 1000: + resonance_score = max(0, bandwidth / 1000 * 70) # Too narrow + elif bandwidth <= 2500: + resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30) # Optimal range + else: + resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50)) # Too wide + + # Voice strength - based on RMS energy + rms = np.mean(librosa.feature.rms(y=y)) + # Normalize (typical speech RMS values range from 0.01 to 0.2) + strength_score = min(100, max(0, rms / 0.2 * 100)) + + # Combine additional metrics + additional_metrics = { + "pitch_stability": pitch_stability, + "voice_resonance": resonance_score, + "voice_strength": strength_score + } + + # Add to results + combined_results = { + "VCS": vcs, + "insight": clarity_results["insight"], + "components": { + **clarity_results["components"], + **additional_metrics + }, + "transcript": clarity_results["transcript"] + } + + return combined_results + +# Ensure the functions are exposed when imported +__all__ = ['compute_voice_clarity_score', 'analyze_voice_quality'] \ No newline at end of file diff --git a/vcs/main.py b/vcs/main.py new file mode 100644 index 0000000000000000000000000000000000000000..fcc9a7641862986cfcccd963d970d3da65dc4570 --- /dev/null +++ b/vcs/main.py @@ -0,0 +1,49 @@ +import json +import whisper +from .compute_vcs import analyze_voice_quality + +def main(): + """ + Main function to run voice clarity analysis on audio files + """ + # Fixed 
parameters - modify these values directly in the code + audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" # Path to your audio file + model_size = "base" # Whisper model size (tiny, base, small, medium, large) + verbose = True # Whether to print detailed results + + try: + # Load whisper model + print(f"Loading Whisper model ({model_size})...") + whisper_model = whisper.load_model(model_size) + + # Calculate voice clarity score + print(f"Analyzing voice clarity for {audio_file}...") + results = analyze_voice_quality(audio_file, whisper_model) + + # Print summary results + print("\nVoice Quality Analysis Results:") + print(f"- Voice Clarity Score (VCS): {results['VCS']:.2f}/100") + print(f"- Insight: {results['insight']}") + print(f"- Articulation: {results['components']['articulation']:.2f}/100") + print(f"- Enunciation: {results['components']['enunciation']:.2f}/100") + print(f"- Speech Pause Control: {results['components']['speech_pause_control']:.2f}/100") + + # Print verbose results if enabled + if verbose: + print("\nDetailed Metrics:") + print(f"- Pitch Stability: {results['components']['pitch_stability']:.2f}/100") + print(f"- Voice Resonance: {results['components']['voice_resonance']:.2f}/100") + print(f"- Voice Strength: {results['components']['voice_strength']:.2f}/100") + print(f"- Word Count: {results['components']['word_count']}") + print(f"- Duration: {results['components']['duration']:.2f} seconds") + + # Print first 100 characters of transcript + transcript_preview = results['transcript'][:100] + "..." if len(results['transcript']) > 100 else results['transcript'] + print(f"\nTranscript preview: {transcript_preview}") + + except Exception as e: + print(f"Error during analysis: {str(e)}") + return 1 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/vcs/vcs.py b/vcs/vcs.py new file mode 100644 index 0000000000000000000000000000000000000000..98eca0e34280ae382e49e2af94f478e7d4b9151d --- /dev/null +++ b/vcs/vcs.py @@ -0,0 +1,176 @@ +""" +Voice Clarity Score calculation module +""" + +import librosa +import numpy as np +from typing import Dict, Any, List +import soundfile as sf + +def calculate_articulation(y: np.ndarray, sr: int) -> float: + """ + Calculate articulation quality based on spectral contrast. + + Articulation refers to how clearly individual phonemes are produced. + + Args: + y (np.ndarray): Audio signal + sr (int): Sample rate + + Returns: + float: Articulation score (0-100) + """ + # Extract spectral contrast + # Higher contrast between peaks and valleys in the spectrum generally correlates with clearer articulation + S = np.abs(librosa.stft(y)) + contrast = librosa.feature.spectral_contrast(S=S, sr=sr) + + # Average across frequency bands and frames + mean_contrast = np.mean(contrast) + + # Normalize to 0-100 scale (empirically determined range) + # Typical values range from 10-50 dB + min_contrast = 10 + max_contrast = 50 + normalized_contrast = min(100, max(0, (mean_contrast - min_contrast) / (max_contrast - min_contrast) * 100)) + + return normalized_contrast + +def calculate_enunciation(y: np.ndarray, sr: int) -> float: + """ + Calculate enunciation quality based on formant clarity and spectral flatness. + + Enunciation is the precision in pronouncing vowels and consonants.
+ + Args: + y (np.ndarray): Audio signal + sr (int): Sample rate + + Returns: + float: Enunciation score (0-100) + """ + # Compute spectral flatness - lower values indicate clearer formants and better enunciation + flatness = np.mean(librosa.feature.spectral_flatness(y=y)) + + # Compute spectral centroid - related to "brightness" or articulation clarity + centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)) + + # Normalize flatness (lower is better for speech) - range typically 0.01-0.5 + norm_flatness = max(0, min(100, (0.5 - flatness) / 0.5 * 100)) + + # Normalize centroid (mid-range is better for clear speech) - typically 1000-4000 Hz for clear speech + ideal_centroid = 2500 # Hz + centroid_deviation = abs(centroid - ideal_centroid) / 2000 # Normalized by expected deviation + norm_centroid = max(0, min(100, (1 - centroid_deviation) * 100)) + + # Combine the two metrics (with more weight on flatness) + enunciation_score = (0.7 * norm_flatness) + (0.3 * norm_centroid) + + return enunciation_score + +def calculate_speech_pause_control(segments: List[Dict]) -> float: + """ + Calculate how effectively pauses are integrated in speech. + + Speech pause control refers to the natural vs. abrupt pauses in speech. + + Args: + segments (List[Dict]): List of transcript segments with timing information + + Returns: + float: Speech pause control score (0-100) + """ + if len(segments) < 2: + return 100.0 # Not enough segments to evaluate pauses + + pause_durations = [] + for i in range(len(segments) - 1): + pause_dur = segments[i + 1]["start"] - segments[i]["end"] + if pause_dur > 0.05: # Only consider actual pauses + pause_durations.append(pause_dur) + + if not pause_durations: + return 100.0 # No significant pauses detected + + # Calculate the standard deviation of pause durations + # More consistent pauses indicate better control + pause_std = np.std(pause_durations) + + # Calculate proportion of very long pauses (potentially awkward) + long_pauses = sum(1 for d in pause_durations if d > 2.0) + long_pause_ratio = long_pauses / len(pause_durations) if pause_durations else 0 + + # Normalize std dev (lower is better, but not too low) + # Ideal range is around 0.2-0.5 seconds + if pause_std < 0.1: + std_score = 70 # Too consistent might sound robotic + elif pause_std < 0.5: + std_score = 100 - ((pause_std - 0.1) / 0.4 * 30) # Scale 70-100 + else: + std_score = max(0, 70 - ((pause_std - 0.5) / 2.0 * 70)) # Scale down from 70 + + # Penalize for too many long pauses + long_pause_penalty = long_pause_ratio * 50 + + # Final score + pause_control_score = max(0, min(100, std_score - long_pause_penalty)) + + return pause_control_score + +def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]: + """ + Calculate the Voice Clarity Score (VCS) and its components. + + VCS reflects the clarity and intelligibility of speech. 
+ + Args: + y (np.ndarray): Audio signal + sr (int): Sample rate + segments (List[Dict]): List of transcript segments with timing information + + Returns: + Dict[str, Any]: Dictionary with VCS and component scores + """ + # Calculate component scores + articulation_score = calculate_articulation(y, sr) + enunciation_score = calculate_enunciation(y, sr) + speech_pause_control_score = calculate_speech_pause_control(segments) + + # Calculate Voice Clarity Score using the formula from the paper + vcs = (0.45 * articulation_score) + (0.35 * enunciation_score) + (0.2 * speech_pause_control_score) + + # Create result dictionary + result = { + "VCS": vcs, + "components": { + "articulation": articulation_score, + "enunciation": enunciation_score, + "speech_pause_control": speech_pause_control_score + } + } + + # Add interpretation + result["insight"] = get_clarity_insight(vcs) + + return result + +def get_clarity_insight(vcs: float) -> str: + """ + Generate insight text based on the Voice Clarity Score. + + Args: + vcs (float): Voice Clarity Score (0-100) + + Returns: + str: Insight text explaining the score + """ + if vcs >= 85: + return "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to." + elif vcs >= 70: + return "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity." + elif vcs >= 50: + return "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing." + elif vcs >= 30: + return "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity." + else: + return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial." 
\ No newline at end of file diff --git a/vcs/vcs_api.py b/vcs/vcs_api.py new file mode 100644 index 0000000000000000000000000000000000000000..fcf1d636cfbce50b29ffc9b080491573ac995ba0 --- /dev/null +++ b/vcs/vcs_api.py @@ -0,0 +1,21 @@ +import whisper +from .compute_vcs import analyze_voice_quality + +def main(file_path: str, model_size: str = "base") -> dict: + try: + + whisper_model = whisper.load_model(model_size) + + results = analyze_voice_quality(file_path, whisper_model) + + # Structure response + response = { + "Voice Clarity Score": round(results['VCS'], 2) + # "Articulation": round(results['components']['articulation'],2), + # "Enunciation": round(results['components']['enunciation'],2), + # "Speech Pause Control": round(results['components']['speech_pause_control'],2), + } + return response + + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") diff --git a/vers/__init__.py b/vers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vers/__pycache__/__init__.cpython-312.pyc b/vers/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ca7715ec79e60ffec669381a54283286f28ac53 Binary files /dev/null and b/vers/__pycache__/__init__.cpython-312.pyc differ diff --git a/vers/__pycache__/compute_vers_score.cpython-312.pyc b/vers/__pycache__/compute_vers_score.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae686d8b2282cc1560fb62c21af56c56ed639018 Binary files /dev/null and b/vers/__pycache__/compute_vers_score.cpython-312.pyc differ diff --git a/vers/__pycache__/filler_analyzer.cpython-312.pyc b/vers/__pycache__/filler_analyzer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2df2fa39e3c178d968e88fa6ccd9a21d91c4f1bc Binary files /dev/null and b/vers/__pycache__/filler_analyzer.cpython-312.pyc differ diff --git a/vers/__pycache__/find_valence.cpython-312.pyc b/vers/__pycache__/find_valence.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..78c60f1141163b69c49345d13608b94147de580d Binary files /dev/null and b/vers/__pycache__/find_valence.cpython-312.pyc differ diff --git a/vers/__pycache__/main.cpython-312.pyc b/vers/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6d673f6560aaeda413e262e9aad5e43a795ca96 Binary files /dev/null and b/vers/__pycache__/main.cpython-312.pyc differ diff --git a/vers/__pycache__/vers.cpython-312.pyc b/vers/__pycache__/vers.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cc3058d2e107ec4f48ef18731ab43366e2edff6 Binary files /dev/null and b/vers/__pycache__/vers.cpython-312.pyc differ diff --git a/vers/__pycache__/vers_api.cpython-312.pyc b/vers/__pycache__/vers_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7549a08005ec81933ed721ef59fd02755142ca09 Binary files /dev/null and b/vers/__pycache__/vers_api.cpython-312.pyc differ diff --git a/vers/compute_vers_score.py b/vers/compute_vers_score.py new file mode 100644 index 0000000000000000000000000000000000000000..f4c8a14e3dab0e0c6dd5121796ef430b7db071eb --- /dev/null +++ b/vers/compute_vers_score.py @@ -0,0 +1,85 @@ +from .vers import calc_vers +import librosa +import numpy as np +import math +from .filler_analyzer import detect_fillers +from .find_valence import get_valence_score + +def compute_vers_score(file_path: str,
whisper_model) -> dict: + """ + Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample. + """ + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + + + # Filler count + filler_count, _ = detect_fillers(transcript) + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + + # Volume (RMS) + rms = librosa.feature.rms(y=y)[0] + mean_rms = float(np.mean(rms)) + mean_volume_db = 20 * math.log10(mean_rms + 1e-6) if mean_rms > 0 else -80.0 + volume_std = np.std(20 * np.log10(rms + 1e-6)) + + # Max volume + vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0 + vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0 + + # Pitch variation + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + pitch_variation = 0.0 + if voiced_f0.size > 0: + median_f0 = np.nanmedian(voiced_f0) + median_f0 = max(median_f0, 1e-6) + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + pitch_variation = float(np.nanstd(semitone_diffs)) + + # Pause analysis + total_speaking_time = 0.0 + long_pause_count = 0 + if segments: + for seg in segments: + total_speaking_time += (seg["end"] - seg["start"]) + for i in range(len(segments) - 1): + pause_dur = segments[i+1]["start"] - segments[i]["end"] + if pause_dur > 1.0: + long_pause_count += 1 + first_start = segments[0]["start"] + last_end = segments[-1]["end"] + if first_start > 1.0: + long_pause_count += 1 + if duration - last_end > 1.0: + long_pause_count += 1 + + # WPM + words = transcript.split() + word_count = len(words) + words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0 + + + valence_scores = get_valence_score(file_path) + + # Calculate VERS + vers_result = calc_vers( + filler_count=filler_count, + long_pause_count=long_pause_count, + pitch_variation=pitch_variation, + mean_volume_db=mean_volume_db, + vol_max_db=vol_max_db, + wpm=words_per_min, + volume_std=volume_std, + valence_scores=valence_scores + ) + + # Include transcript optionally + vers_result["transcript"] = transcript + return vers_result diff --git a/vers/filler_analyzer.py b/vers/filler_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..9db9c83861b928c0663b0b00655c06e5c5484f12 --- /dev/null +++ b/vers/filler_analyzer.py @@ -0,0 +1,101 @@ +# Define filler words for English, Hindi, Tamil (in both Latin and native scripts) +# Mapping each variant to a common label (usually the Latin script for insight reporting) +FILLER_VARIANTS = { + # English fillers + "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er", + "umm": "um", "uhh": "uh", "mmm": "hmm", + "like": "like", "you know": "you know", "so": "so", "well": "well", + # Hindi fillers (Devanagari and transliteration) + "मतलब": "matlab", "matlab": "matlab", + "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain", + "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na", + "ऐसा है": "aisa hai", "aisa hai": "aisa hai", + "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan" + "अच्छा": "acha", "acha": "acha", + # Tamil fillers (Tamil script and transliteration) + "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na", + "அப்பரம்": "apparam", "apparam": "apparam", + "என்ன": "enna", "enna": "enna" +} + +def detect_fillers(transcript): + 
""" + Detects filler words in the transcript. + + Args: + transcript: Full transcript text + + Returns: + tuple: (filler_count, filler_occurrences) + """ + transcript_lower = transcript.lower() + filler_count = 0 + # Track which specific fillers were used (for insight examples) + filler_occurrences = {} + + for variant, label in FILLER_VARIANTS.items(): + if variant in transcript_lower: + count = transcript_lower.count(variant) + if count > 0: + filler_count += count + # Accumulate count for the normalized label + filler_occurrences[label] = filler_occurrences.get(label, 0) + count + + return filler_count, filler_occurrences + + +def analyze_filler_words(filler_count, filler_occurrences, duration): + """ + Analyzes filler word usage in speech. + + Args: + filler_count: Total count of filler words + filler_occurrences: Dictionary of specific filler words and their counts + duration: Duration of the audio in seconds + + Returns: + dict: Contains the filler words score and insight text + """ + # Extract top examples for insights + filler_examples = [] + if filler_occurrences: + # Sort by frequency + sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True) + for label, count in sorted_fillers[:2]: + filler_examples.append(label) + + # Compute fillers per minute as a gauge + filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0 + + if filler_count == 0: + filler_score = 10 + elif filler_per_min < 1: + filler_score = 9 + elif filler_per_min < 3: + filler_score = 8 + elif filler_per_min < 5: + filler_score = 6 + elif filler_per_min < 10: + filler_score = 4 + else: + filler_score = 2 + + filler_score = max(0, filler_score) + + # Generate insight text based on the score and examples + if filler_count == 0: + insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear." + elif filler_count <= 2: + example = filler_examples[0] if filler_examples else "um" + insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact." + elif filler_count <= 5: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words" + insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity." + else: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'" + insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty." 
+ + return { + "score": int(filler_score), + "insight": insight + } \ No newline at end of file diff --git a/vers/find_valence.py b/vers/find_valence.py new file mode 100644 index 0000000000000000000000000000000000000000..6d34558dc2063857d89cd8007717bf8908fd1c86 --- /dev/null +++ b/vers/find_valence.py @@ -0,0 +1,100 @@ +# from transformers.models.wav2vec2 import Wav2Vec2Model, Wav2Vec2FeatureExtractor +# import torchaudio +# import torch +# import torch.nn as nn + + + +def get_valence_score(file_path): + # class VADPredictor(nn.Module): + # """Model to predict VAD Scores""" + # def __init__(self, pretrained_model_name="facebook/wav2vec2-base-960h", freeze_feature_extractor=True): + # super(VADPredictor, self).__init__() + + # self.wav2vec2 = Wav2Vec2Model.from_pretrained(pretrained_model_name) + + # if freeze_feature_extractor: + # for param in self.wav2vec2.feature_extractor.parameters(): + # param.requires_grad = False + + # hidden_size = self.wav2vec2.config.hidden_size + + # self.valence_layers = nn.Sequential( + # nn.Linear(hidden_size, 256), + # nn.ReLU(), + # nn.Dropout(0.3), + # nn.Linear(256,64), + # nn.Linear(64,1) + # ) + # self.arousal_layers = nn.Sequential( + # nn.Linear(hidden_size, 256), + # nn.ReLU(), + # nn.Dropout(0.3), + # nn.Linear(256,64), + # nn.Linear(64,1) + # ) + # self.dominance_layers = nn.Sequential( + # nn.Linear(hidden_size, 256), + # nn.ReLU(), + # nn.Dropout(0.3), + # nn.Linear(256,64), + # nn.Linear(64,1) + # ) + + # def forward(self, input_values, attention_mask=None): + # outputs = self.wav2vec2(input_values, attention_mask=attention_mask) + # last_hidden_state = outputs.last_hidden_state + # pooled_output = torch.mean(last_hidden_state, dim=1) + + # valence = self.valence_layers(pooled_output) + # arousal = self.arousal_layers(pooled_output) + # dominance = self.dominance_layers(pooled_output) + + # return { + # 'valence': valence.squeeze(-1), + # 'arousal': arousal.squeeze(-1), + # 'dominance': dominance.squeeze(-1) + # } + + + # model = VADPredictor() + # model.load_state_dict(torch.load(r"D:\Intern\shankh\DUMP\vad_predictor_model.pt", map_location=torch.device("cpu"))) + # model.eval() + + # feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + + # # Load and process audio + # file_path = file_path + # waveform, sr = torchaudio.load(file_path) + + # # Convert to mono + # if waveform.shape[0] > 1: + # waveform = waveform.mean(dim=0, keepdim=True) + + # # Resample to 16000 Hz + # if sr != 16000: + # resampler = torchaudio.transforms.Resample(sr, 16000) + # waveform = resampler(waveform) + # sr = 16000 + + # # Normalize + # waveform = waveform / waveform.abs().max() + + # # Parameters + # segment_sec = 1 + # segment_samples = int(segment_sec * sr) + + # valence_scores = [] + + # # Inference per segment + # with torch.no_grad(): + # for start in range(0, waveform.shape[1] - segment_samples + 1, segment_samples): + # segment = waveform[:, start:start+segment_samples] + # input_values = feature_extractor(segment.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values + # output = model(input_values) + # val = output['valence'].item() + # valence_scores.append(val) + valence_scores = 5.0 + + return valence_scores + diff --git a/vers/main.py b/vers/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f1c7749db582f8154dbc2904c2b85d84101676f3 --- /dev/null +++ b/vers/main.py @@ -0,0 +1,16 @@ + +from .compute_vers_score import compute_vers_score +import whisper + + + 
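+# Smoke test for the VERS pipeline: load a Whisper model once and score a local
+# sample recording. The hard-coded path below is a development sample; point it
+# at any readable audio file to try the scorer.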
+whisper_model = whisper.load_model("base") + +test_result = compute_vers_score(r"D:\Intern\shankh\audio_samples\obama_short.wav", whisper_model) + +print("VERS Score:", test_result["VERS"]) +print("ESS:", test_result["ESS"]) +print("LCS:", test_result["LCS"]) +print("SRS:", test_result["SRS"]) +print("Insight:", test_result["insight"]) +print("Transcript:", test_result["transcript"]) diff --git a/vers/vers.py b/vers/vers.py new file mode 100644 index 0000000000000000000000000000000000000000..4a207a6abd9ea3b66e7d4f5f508ffda1060c808c --- /dev/null +++ b/vers/vers.py @@ -0,0 +1,118 @@ +import numpy as np + +def calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores): + """ + Emotional Stability Score(ESS) : Measures the consistency of the speaker's emotional tone, reflecting their ability to regulate emotions during speech. + + Requires: + Tonal Steadiness: The lack of extreme fluctuations in emotional tone. + Absence of Sudden Loudness Spikes: Indicates controlled expression without abrupt emotional shifts. + Valence Stability: Consistency in the overall positive or negative tone across the speech. + """ + # calculate tonal steadiness + tonal_steadiness = max(0, 100 - (pitch_variation * 10)) + + # calculate loudness spikes + spike = max(0, vol_max_db - mean_volume_db - 15) + spike_ratio = min(spike / 30, 1.0) # Normalize with typical loudness range + stability = 1 - spike_ratio + loudness_stability = stability * 100 + + # calculate valence stability + valence_stability = 100 - (np.std(valence_scores) * 20) + + ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability)) + print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}") + return ESS + +def calc_lcs(volume_std, vol_max_db, mean_volume_db): + """ + Loudness Control Score (LCS): Evaluates how well the speaker manages volume + + Requires: + - Volume Stability: Consistency in speech amplitude. + - Controlled Emphasis: The ability to modulate loudness smoothly for emphasis rather than abrupt changes. + """ + vol_stability = max(0, 100 - (volume_std * 5)) # Scale std for speech (5 dB std = 75) + + # Controlled Emphasis (45%) + emphasis_spike = max(0, vol_max_db - mean_volume_db - 3) + spike_ratio = min(emphasis_spike / 15, 1.0) # Normalize to 15 dB range + emphasis_control = (1 - spike_ratio) * 100 + + # Combine scores + lcs = 0.55 * vol_stability + 0.45 * emphasis_control + print(f"vol_stability: {vol_stability}, emphasis_control: {emphasis_control}") + return min(100, max(0, lcs)) + +def calc_srs(wpm, filler_count, long_pause_count, pitch_variation): + """ + Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm. + + Requires: + - Words per Minute Consistency: Regularity in speech speed. + - Absence of Sudden Speed Shifts: Smooth transitions without erratic tempo changes. 
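+
+    Args:
+        wpm: Observed speaking rate in words per minute (150 wpm is treated as ideal).
+        filler_count: Number of filler words detected in the transcript.
+        long_pause_count: Number of pauses longer than one second.
+        pitch_variation: Standard deviation of pitch in semitones.
+
+    Returns:
+        float: SRS on a 0-100 scale.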
+ """ + ideal_wpm = 150 + wpm_deviation = min(30, abs(wpm - ideal_wpm)) # Cap at 30 WPM deviation + wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) # 100-50 for max deviation + + # Sudden Speech Shift Penalty + filler_penalty = min(filler_count / 10, 1.0) + pause_penalty = min(long_pause_count / 5, 1.0) + pitch_penalty = min(pitch_variation / 3.0, 1.0) # High variation → unstable + + # Combine into absence of sudden shifts + stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100 + + # Final SRS Score + SRS = (0.45 * wpm_consistency) + (0.55 * stability) + print(f"wpm_consistency: {wpm_consistency}, stability: {stability}") + return min(100, max(0, SRS)) + +def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores): + ESS = calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores) + LCS = calc_lcs(volume_std, vol_max_db, mean_volume_db) + SRS = calc_srs(wpm, filler_count, long_pause_count, pitch_variation) + + # Calculate the VERS score using the formula + VERS = (0.5 * ESS) + (0.3 * LCS) + (0.2 * SRS) # This would be value from 0 to 100 + + if VERS > 0 and VERS < 50: + insight = """Poor regulation—noticeable swings in tone and uncontrolled + emotional expression. Feedback: Consider exercises and professional + coaching to stabilize your emotional delivery.""" + elif VERS >= 50 and VERS < 80: + insight = """Moderate regulation—occasional fluctuations or abrupt changes. + Feedback: Work on smoothing out volume changes and maintaining a steady tone.""" + elif VERS >= 80 and VERS <= 100: + insight = """Excellent regulation—steady tone and controlled volume dynamics. + Feedback: Continue using techniques that maintain emotional balance.""" + else: + insight = "Invalid score calculated" + + return { + "VERS": int(VERS), + "ESS": round(ESS, 1), + "LCS": round(LCS, 1), + "SRS": round(SRS, 1), + "insight": insight + } + +# # Test input +# test_result = calc_vers( +# filler_count=4, +# long_pause_count=2, +# pitch_variation=3.2, +# mean_volume_db=65, +# vol_max_db=82, +# wpm=148, +# volume_std=4.1, +# valence_scores=[5.2, 5.5, 4.9] +# ) + +# print("VERS Score:", test_result["VERS"]) +# print("ESS:", test_result["ESS"]) +# print("LCS:", test_result["LCS"]) +# print("SRS:", test_result["SRS"]) +# print("Insight:", test_result["insight"]) \ No newline at end of file diff --git a/vers/vers_api.py b/vers/vers_api.py new file mode 100644 index 0000000000000000000000000000000000000000..304efe045939d0ea8b94d2b8b290fb23504d124c --- /dev/null +++ b/vers/vers_api.py @@ -0,0 +1,44 @@ +import whisper +import numpy as np +from .compute_vers_score import compute_vers_score + +def convert_numpy_types(obj): + """Convert NumPy types to Python native types for JSON serialization.""" + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, dict): + return {k: convert_numpy_types(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_numpy_types(i) for i in obj] + else: + return obj + +def main(file_path: str, model_size: str = "base") -> dict: + try: + # Load whisper model + whisper_model = whisper.load_model(model_size) + + # Compute VERS score + results = compute_vers_score(file_path, whisper_model) + + # Convert any NumPy types to native Python types + results = convert_numpy_types(results) + + # Structure response with rounded values + # (using Python's 
built-in round function which returns Python native float) + response = { + "VERS Score": round(results['VERS'], 2) + # "ESS": round(results['ESS'], 2), + # "LCS": round(results['LCS'], 2), + # "SRS": round(results['SRS'], 2), + # "Insight": results['insight'], + } + + return response + + except Exception as e: + raise RuntimeError(f"Error during analysis: {str(e)}") \ No newline at end of file diff --git a/ves/__init__.py b/ves/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ves/__pycache__/__init__.cpython-312.pyc b/ves/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f56a2146d0c8633d32e5630212a5f9a290530ca Binary files /dev/null and b/ves/__pycache__/__init__.cpython-312.pyc differ diff --git a/ves/__pycache__/ves.cpython-312.pyc b/ves/__pycache__/ves.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee24dd581cb595943ae3065b909db3486d233944 Binary files /dev/null and b/ves/__pycache__/ves.cpython-312.pyc differ diff --git a/ves/ves.py b/ves/ves.py new file mode 100644 index 0000000000000000000000000000000000000000..db7b35e07adda37d45758ff133b275d09bed5265 --- /dev/null +++ b/ves/ves.py @@ -0,0 +1,26 @@ +# voice engagement score = 0.4 * valence + 0.3 * arousal + 0.3 * SDS +from tone_modulation.sds import calc_sds + +def get_valence_and_arousal(file_path): + + valence = 4.5 #we get this from model + + arousal = 3.2 #we get this from model + + return valence, arousal + + + +def calc_voice_engagement_score(file_path): + valence, arousal = get_valence_and_arousal(file_path) + + # Calculate SDS + + sds = calc_sds(file_path) + + ves = 0.4 * valence + 0.3 * arousal + 0.3 * sds + + return { + # "sds": sds, + "ves": ves + } \ No newline at end of file diff --git a/voice_confidence_score/__init__.py b/voice_confidence_score/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/voice_confidence_score/__pycache__/__init__.cpython-312.pyc b/voice_confidence_score/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ecc05c5be8cc0abc43a90fa6f090322bafd17a45 Binary files /dev/null and b/voice_confidence_score/__pycache__/__init__.cpython-312.pyc differ diff --git a/voice_confidence_score/__pycache__/main.cpython-312.pyc b/voice_confidence_score/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..970cd1c3c08430183e19a5ca90fc447092215180 Binary files /dev/null and b/voice_confidence_score/__pycache__/main.cpython-312.pyc differ diff --git a/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc b/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0be2d91ea85d9c610e762a8ffcf03c247ee2acc7 Binary files /dev/null and b/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc differ diff --git a/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc b/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1302c831071e11829e8faf4d6cbdae24a6caea11 Binary files /dev/null and b/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc differ diff --git a/voice_confidence_score/main.py b/voice_confidence_score/main.py new file mode 
100644 index 0000000000000000000000000000000000000000..9e582c228c96d85bbc01d5b24e4125a54d72b46c --- /dev/null +++ b/voice_confidence_score/main.py @@ -0,0 +1,11 @@ +from .voice_confidence import calc_voice_confidence_score +import whisper + +model_size = "base" +whisper_model = whisper.load_model(model_size) + +audio_file = r"D:\Intern\shankh\audio_samples\obama_short.wav" + +result = calc_voice_confidence_score(audio_file, whisper_model) + +print(f"Voice Confidence Score: {result:.2f}") \ No newline at end of file diff --git a/voice_confidence_score/voice_confidence.py b/voice_confidence_score/voice_confidence.py new file mode 100644 index 0000000000000000000000000000000000000000..1184206d76c61806d083f7cd8284064f691fd67b --- /dev/null +++ b/voice_confidence_score/voice_confidence.py @@ -0,0 +1,38 @@ +# voice confidence score = 0.4 * dominance + 0.3 * scs + 0.3 * fluency. + +import whisper +from fluency.compute_fluency import compute_fluency_score +from vcs.compute_vcs import analyze_voice_quality + + +def calc_fluency_score(audio_path, whisper_model): + + # Calculate fluency score + print(f"Analyzing fluency for {audio_path}...") + results = compute_fluency_score(audio_path, whisper_model) + fluency_score = results['fluency_score'] + + return fluency_score + +def calc_vcs(audio_path, whisper_model): + + + # Calculate voice clarity score + print(f"Analyzing voice clarity for {audio_path}...") + results = analyze_voice_quality(audio_path, whisper_model) + vcs = results['VCS'] + + return vcs + +dominance = 5.6 # dummy for now i add later + +def calc_voice_confidence_score(audio_path, model): + + fluency_score = calc_fluency_score(audio_path, model) + vcs = calc_vcs(audio_path, model) + + # Calculate voice confidence score + voice_confidence_score = 0.4 * dominance + 0.3 * vcs + 0.3 * fluency_score + + return voice_confidence_score + diff --git a/voice_confidence_score/voice_confidence_api.py b/voice_confidence_score/voice_confidence_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cd6b3ec671f803678b22e08e7212cf52b26d5c51 --- /dev/null +++ b/voice_confidence_score/voice_confidence_api.py @@ -0,0 +1,16 @@ +import whisper +from .voice_confidence import calc_voice_confidence_score + +def main(file_path: str, model_size: str = "base") -> dict: + try: + # Load the Whisper model + whisper_model = whisper.load_model(model_size) + + # Calculate the voice confidence score + result = calc_voice_confidence_score(file_path, whisper_model) + + # Return the result as a dictionary + return {"voice_confidence_score": round(result, 2)} + except Exception as e: + return {"error": str(e)} + diff --git a/vps/__init__.py b/vps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vps/__pycache__/__init__.cpython-312.pyc b/vps/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0a9b97e335aeea6df957c3d11c3ccbdda90743e9 Binary files /dev/null and b/vps/__pycache__/__init__.cpython-312.pyc differ diff --git a/vps/__pycache__/compute_vps_score.cpython-312.pyc b/vps/__pycache__/compute_vps_score.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1533ad1a4897db15fbbcb581e7bedfeb58acab46 Binary files /dev/null and b/vps/__pycache__/compute_vps_score.cpython-312.pyc differ diff --git a/vps/__pycache__/filler_analyzer.cpython-312.pyc b/vps/__pycache__/filler_analyzer.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..1cd5b97993b072eaba33916fc9b9e5f2fbc4ef78 Binary files /dev/null and b/vps/__pycache__/filler_analyzer.cpython-312.pyc differ diff --git a/vps/__pycache__/main.cpython-312.pyc b/vps/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..334d491c8a9c4ab50a62037bfad6e49ddf9ed0b1 Binary files /dev/null and b/vps/__pycache__/main.cpython-312.pyc differ diff --git a/vps/__pycache__/vps.cpython-312.pyc b/vps/__pycache__/vps.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0bab3f8d9d7496bb7118f2cdd9af775b92e2807 Binary files /dev/null and b/vps/__pycache__/vps.cpython-312.pyc differ diff --git a/vps/__pycache__/vps_api.cpython-312.pyc b/vps/__pycache__/vps_api.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f185440db6b795894a63f7143bdcfc0ada12bb80 Binary files /dev/null and b/vps/__pycache__/vps_api.cpython-312.pyc differ diff --git a/vps/compute_vps_score.py b/vps/compute_vps_score.py new file mode 100644 index 0000000000000000000000000000000000000000..883bb25ca20ed7edea0f442250c172971a65e5dc --- /dev/null +++ b/vps/compute_vps_score.py @@ -0,0 +1,79 @@ +from .vps import calculate_vps # Your file where calc_srs, calculate_pas, calculate_rcs, calculate_vps live +import librosa +import numpy as np +import math +from .filler_analyzer import detect_fillers + +def compute_vps_score(file_path: str, whisper_model) -> dict: + """ + Compute VPS (Voice Pacing Score) and its components from a speech sample. + + Args: + file_path (str): Path to the audio file. + whisper_model: Transcription model (e.g., OpenAI Whisper or faster-whisper) + + Returns: + dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores. 
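+
+    Raises:
+        ValueError: If the transcript or segments are empty, or if the audio duration is invalid.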
+ """ + # Transcribe + result = whisper_model.transcribe(file_path) + transcript = result.get("text", "").strip() + segments = result.get("segments", []) + + # Validate early + if not transcript or not segments: + raise ValueError("Empty transcript or segments from Whisper.") + + # Filler count + filler_count, _ = detect_fillers(transcript) + + # Load audio + y, sr = librosa.load(file_path, sr=None) + duration = len(y) / sr if sr else 0.0 + if duration <= 0: + raise ValueError("Audio duration invalid or zero.") + + # Pitch variation (in semitones) + f0, voiced_flags, voiced_probs = librosa.pyin( + y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan) + voiced_f0 = f0[~np.isnan(f0)] + pitch_variation = 0.0 + if voiced_f0.size > 0: + median_f0 = np.nanmedian(voiced_f0) + median_f0 = max(median_f0, 1e-6) + semitone_diffs = 12 * np.log2(voiced_f0 / median_f0) + pitch_variation = float(np.nanstd(semitone_diffs)) + + # Pause analysis + long_pause_count = 0 + if segments: + for i in range(len(segments) - 1): + pause_dur = segments[i + 1]["start"] - segments[i]["end"] + if pause_dur > 1.0: + long_pause_count += 1 + # Beginning and end + if segments[0]["start"] > 1.0: + long_pause_count += 1 + if duration - segments[-1]["end"] > 1.0: + long_pause_count += 1 + + # WPM + word_count = len(transcript.split()) + words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0 + + # Calculate VPS and components + vps_result = calculate_vps( + transcript=transcript, + segments=segments, + filler_count=filler_count, + duration=duration, + wpm=words_per_min, + long_pause_count=long_pause_count, + pitch_variation=pitch_variation, + y=y, + sr=sr + ) + + # Include transcript optionally + vps_result["transcript"] = transcript + return vps_result diff --git a/vps/filler_analyzer.py b/vps/filler_analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..090853dcbea679429376dbc44ace524e671baf61 --- /dev/null +++ b/vps/filler_analyzer.py @@ -0,0 +1,100 @@ +# Define filler words for English, Hindi, Tamil (in both Latin and native scripts) +# Mapping each variant to a common label (usually the Latin script for insight reporting) +FILLER_VARIANTS = { + # English fillers + "um": "um", "uh": "uh", "hmm": "hmm", "ah": "ah", "er": "er", + "umm": "um", "uhh": "uh", "mmm": "hmm", + "like": "like", "you know": "you know", "so": "so", "well": "well", + # Hindi fillers (Devanagari and transliteration) + "मतलब": "matlab", "matlab": "matlab", + "क्या कहते हैं": "kya kehte hain", "kya kehte hain": "kya kehte hain", + "वो ना": "wo na", "woh na": "wo na", "wo na": "wo na", + "ऐसा है": "aisa hai", "aisa hai": "aisa hai", + "हाँ": "haan", "haan": "haan", "हा": "haan", # "हा" might appear as a shorter "haan" + "अच्छा": "acha", "acha": "acha", + # Tamil fillers (Tamil script and transliteration) + "பாத்தீங்கனா": "paatheenga-na", "paatheenga na": "paatheenga-na", "paatheenga-na": "paatheenga-na", + "அப்பரம்": "apparam", "apparam": "apparam", + "என்ன": "enna", "enna": "enna" +} + +def detect_fillers(transcript): + """ + Detects filler words in the transcript. 
+ + Args: + transcript: Full transcript text + + Returns: + tuple: (filler_count, filler_occurrences) + """ + transcript_lower = transcript.lower() + filler_count = 0 + # Track which specific fillers were used (for insight examples) + filler_occurrences = {} + + for variant, label in FILLER_VARIANTS.items(): + if variant in transcript_lower: + count = transcript_lower.count(variant) + if count > 0: + filler_count += count + # Accumulate count for the normalized label + filler_occurrences[label] = filler_occurrences.get(label, 0) + count + + return filler_count, filler_occurrences + +def analyze_filler_words(filler_count, filler_occurrences, duration): + """ + Analyzes filler word usage in speech. + + Args: + filler_count: Total count of filler words + filler_occurrences: Dictionary of specific filler words and their counts + duration: Duration of the audio in seconds + + Returns: + dict: Contains the filler words score and insight text + """ + # Extract top examples for insights + filler_examples = [] + if filler_occurrences: + # Sort by frequency + sorted_fillers = sorted(filler_occurrences.items(), key=lambda x: x[1], reverse=True) + for label, count in sorted_fillers[:2]: + filler_examples.append(label) + + # Compute fillers per minute as a gauge + filler_per_min = (filler_count / duration) * 60.0 if duration > 0 else 0.0 + + if filler_count == 0: + filler_score = 10 + elif filler_per_min < 1: + filler_score = 9 + elif filler_per_min < 3: + filler_score = 8 + elif filler_per_min < 5: + filler_score = 6 + elif filler_per_min < 10: + filler_score = 4 + else: + filler_score = 2 + + filler_score = max(0, filler_score) + + # Generate insight text based on the score and examples + if filler_count == 0: + insight = "No filler words (um, ah, etc.) were detected, keeping the speech very clear." + elif filler_count <= 2: + example = filler_examples[0] if filler_examples else "um" + insight = f"Only a couple of filler words (e.g., '{example}') were used, which had minimal impact." + elif filler_count <= 5: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "filler words" + insight = f"Some filler words {examples} were used occasionally; reducing them could improve clarity." + else: + examples = ", ".join(f"'{ex}'" for ex in filler_examples) if filler_examples else "'um'" + insight = f"Frequent filler words such as {examples} were detected, which can distract the audience and suggest uncertainty." 
+ + return { + "score": int(filler_score), + "insight": insight + } \ No newline at end of file diff --git a/vps/main.py b/vps/main.py new file mode 100644 index 0000000000000000000000000000000000000000..a1bd91d1e21c4f288c6c8ddd4a8231cb800d6a22 --- /dev/null +++ b/vps/main.py @@ -0,0 +1,35 @@ +import whisper +from .compute_vps_score import compute_vps_score # Ensure this path is correct + +def main(): + # 🔧 Set your input audio file path here + audio_path = r"D:\Intern\shankh\audio_samples\obama_short.wav" + + # 🔧 Choose Whisper model (tiny, base, small, medium, large) + model_size = "base" + + print(f"Loading Whisper model: {model_size}") + whisper_model = whisper.load_model(model_size) + + print(f"Analyzing audio: {audio_path}") + try: + vps_result = compute_vps_score(audio_path, whisper_model) + + print("\n--- Voice Pacing Score (VPS) ---") + print(f"VPS Score: {vps_result['VPS']:.2f}") + print(f" - SRS (Speech Rate Stability): {vps_result['SRS']:.2f}") + print(f" - PAS (Pause Appropriateness): {vps_result['PAS']:.2f}") + print(f" - NPP: {vps_result['NPP']:.2f}") + print(f" - AFW: {vps_result['AFW']:.2f}") + print(f" - RCS (Rhythm Consistency): {vps_result['RCS']:.2f}") + print(f" - STR: {vps_result['STR']:.2f}") + print(f" - STW: {vps_result['STW']:.2f}") + + print("\nTranscript:") + print(vps_result["transcript"]) + + except Exception as e: + print(f"[Error] {e}") + +if __name__ == "__main__": + main() diff --git a/vps/vps.py b/vps/vps.py new file mode 100644 index 0000000000000000000000000000000000000000..322f27181efb55ba0d0fa998ddc324703c33e7aa --- /dev/null +++ b/vps/vps.py @@ -0,0 +1,185 @@ +from typing import List, Dict +import librosa +import numpy as np +import spacy +import math +from .filler_analyzer import detect_fillers + +def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float: + """ + Speech Rate Stability (SRS): Reflects the consistency of the speaker's pace and rhythm. + """ + ideal_wpm = 150 + wpm_deviation = min(30, abs(wpm - ideal_wpm)) + wpm_consistency = max(0, 100 - (wpm_deviation * 1.67)) + + filler_penalty = min(filler_count / 10, 1.0) + pause_penalty = min(long_pause_count / 5, 1.0) + pitch_penalty = min(pitch_variation / 3.0, 1.0) + + stability = (1 - ((filler_penalty + pause_penalty + pitch_penalty) / 3)) * 100 + SRS = (0.45 * wpm_consistency) + (0.55 * stability) + return min(100, max(0, SRS)) + +def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]: + """ + Calculate the Pause Appropriateness Score (PAS) and its components. 
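+
+    PAS = (0.4 * NPP) + (0.6 * AFW), where NPP is the share of detected pauses that
+    fall at natural punctuation boundaries and AFW penalizes the filler-word rate.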
+ """ + if not transcript or not segments or duration <= 0: + raise ValueError("Transcript, segments, and duration must be valid") + + nlp = spacy.load("en_core_web_sm") + doc = nlp(transcript) + + words = transcript.split() + total_words = len(words) + if total_words == 0: + raise ValueError("No words found in transcript") + + filler_rate = filler_count / total_words if total_words > 0 else 0.0 + if filler_rate >= 0.10: + afw = 0.0 + elif filler_rate <= 0.0: + afw = 100.0 + else: + afw = 100.0 - (filler_rate * 1000) + afw = max(0.0, min(100.0, afw)) + + total_pauses = 0 + natural_pauses = 0 + segment_texts = [seg["text"].strip() for seg in segments] + segment_starts = [seg["start"] for seg in segments] + segment_ends = [seg["end"] for seg in segments] + + for i in range(len(segments) - 1): + pause_dur = segment_starts[i + 1] - segment_ends[i] + if pause_dur > 0.5: + total_pauses += 1 + if segment_texts[i] and segment_texts[i][-1] in ".!?,": + natural_pauses += 1 + + if segment_starts[0] > 0.5: + total_pauses += 1 + if duration - segment_ends[-1] > 0.5: + total_pauses += 1 + if segment_texts[-1] and segment_texts[-1][-1] in ".!?": + natural_pauses += 1 + + npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0 + pas = (0.4 * npp) + (0.6 * afw) + + return { + "NPP": npp, + "AFW": afw, + "PAS": pas + } + +def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]: + """ + Calculate the Rhythm Consistency Score (RCS) and its components. + """ + if y.size == 0 or sr <= 0 or duration <= 0 or not segments: + raise ValueError("Audio signal, sampling rate, duration, and segments must be valid") + + onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=256) + onsets = librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr, units='time', hop_length=256) + + if len(onsets) > 1: + iois = np.diff(onsets) + ioi_std = np.std(iois) + ioi_std = min(max(ioi_std, 0.1), 0.5) + str_score = 100.0 * (0.5 - ioi_std) / (0.5 - 0.1) + str_score = max(0.0, min(100.0, str_score)) + else: + str_score = 100.0 + + total_transitions = 0 + smooth_transitions = 0 + pause_threshold = 0.3 + + for i in range(len(segments) - 1): + gap = segments[i + 1]["start"] - segments[i]["end"] + total_transitions += 1 + if gap <= pause_threshold: + smooth_transitions += 1 + + for segment in segments: + words = segment["text"].strip().split() + if len(words) > 1: + smooth_transitions += len(words) - 1 + total_transitions += len(words) - 1 + + stw = 100.0 if total_transitions == 0 else (smooth_transitions / total_transitions) * 100.0 + rcs = (0.5 * str_score) + (0.5 * stw) + + return { + "STR": str_score, + "STW": stw, + "RCS": rcs + } + +def calculate_vps( + transcript: str, + segments: List[Dict], + filler_count: int, + duration: float, + wpm: float, + long_pause_count: int, + pitch_variation: float, + y: np.ndarray, + sr: int +) -> Dict[str, float]: + """ + Calculate the Voice Pacing Score (VPS) and its components: + - SRS: Speech Rate Stability Score + - PAS: Pause Appropriateness Score + - RCS: Rhythm Consistency Score + - VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS) + + Args: + transcript (str): Transcribed text. + segments (List[Dict]): Whisper model segments with 'start', 'end', 'text'. + filler_count (int): Number of filler words. + duration (float): Audio duration (seconds). + wpm (float): Words per minute. + long_pause_count (int): Number of long pauses (>1.0s). + pitch_variation (float): Pitch variation in semitones. + y (np.ndarray): Audio signal. 
+        sr (int): Sampling rate.
+
+    Returns:
+        Dict[str, float]: Scores for SRS, PAS, RCS, VPS, and intermediates.
+    """
+    # Validate inputs
+    if not transcript or not segments or duration <= 0 or y.size == 0 or sr <= 0:
+        raise ValueError("Invalid inputs")
+
+    # Calculate SRS
+    srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)
+
+    # Calculate PAS
+    pas_result = calculate_pas(transcript, segments, filler_count, duration)
+    pas = pas_result["PAS"]
+    npp = pas_result["NPP"]
+    afw = pas_result["AFW"]
+
+    # Calculate RCS
+    rcs_result = calculate_rcs(y, sr, segments, duration)
+    rcs = rcs_result["RCS"]
+    str_score = rcs_result["STR"]
+    stw = rcs_result["STW"]
+
+    # Calculate VPS
+    vps = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)
+    vps = max(0.0, min(100.0, vps))
+
+    return {
+        "SRS": srs,
+        "PAS": pas,
+        "NPP": npp,
+        "AFW": afw,
+        "RCS": rcs,
+        "STR": str_score,
+        "STW": stw,
+        "VPS": vps
+    }
\ No newline at end of file
diff --git a/vps/vps_api.py b/vps/vps_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..42e10f2fa6ae093a470e619036f9e151bd67f351
--- /dev/null
+++ b/vps/vps_api.py
@@ -0,0 +1,25 @@
+import whisper
+from .compute_vps_score import compute_vps_score
+
+def main(file_path: str, model_size: str = "base") -> dict:
+    try:
+        # Load the Whisper model
+        whisper_model = whisper.load_model(model_size)
+
+        # Compute the Voice Pacing Score (VPS)
+        result = compute_vps_score(file_path, whisper_model)
+
+        # Return the result as a dictionary
+        return {
+            "VPS": result["VPS"]
+            # "SRS": result["SRS"],
+            # "PAS": result["PAS"],
+            # "NPP": result["NPP"],
+            # "AFW": result["AFW"],
+            # "RCS": result["RCS"],
+            # "STR": result["STR"],
+            # "STW": result["STW"]
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
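Taken together, the modules above expose one callable entry point per score. The sketch below (a minimal driver, assuming the repository root is on the Python path and using sample.wav as a placeholder file name) runs the VERS, VPS, voice-engagement and voice-confidence analyzers on a single recording. Note that the valence, arousal and dominance inputs are still hard-coded placeholders, so the engagement and confidence numbers are not yet meaningful.

from vers.vers_api import main as vers_main
from vps.vps_api import main as vps_main
from ves.ves import calc_voice_engagement_score
from voice_confidence_score.voice_confidence_api import main as confidence_main

audio_path = "sample.wav"  # placeholder; use any local speech recording

# The *_api helpers each load their own Whisper model internally; the engagement
# score is computed directly from the audio file.
print(vers_main(audio_path, model_size="base"))        # {"VERS Score": ...}
print(vps_main(audio_path, model_size="base"))         # {"VPS": ...}
print(calc_voice_engagement_score(audio_path))         # {"ves": ...}
print(confidence_main(audio_path, model_size="base"))  # {"voice_confidence_score": ...}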