Spaces:

cruvss
/

Fast_api

Sleeping

App Files Files Community

mulasagg committed on May 20

Commit

aef3b1e

1 Parent(s): e6cd41c

API optimizations

Browse files

Files changed (23) hide show

app.py +153 -329
filler_count/__pycache__/filler_score.cpython-312.pyc +0 -0
filler_count/filler_score.py +6 -5
fluency/__pycache__/compute_fluency.cpython-312.pyc +0 -0
fluency/__pycache__/fluency_api.cpython-312.pyc +0 -0
fluency/compute_fluency.py +28 -24
fluency/fluency_api.py +2 -2
vcs/__pycache__/compute_vcs.cpython-312.pyc +0 -0
vcs/compute_vcs.py +1 -47
vers/__pycache__/compute_vers_score.cpython-312.pyc +0 -0
vers/__pycache__/vers.cpython-312.pyc +0 -0
vers/__pycache__/vers_api.cpython-312.pyc +0 -0
vers/compute_vers_score.py +17 -9
vers/vers.py +6 -6
vers/vers_api.py +2 -2
voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc +0 -0
voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc +0 -0
voice_confidence_score/voice_confidence.py +7 -4
voice_confidence_score/voice_confidence_api.py +2 -2
vps/__pycache__/compute_vps_score.cpython-312.pyc +0 -0
vps/__pycache__/vps.cpython-312.pyc +0 -0
vps/__pycache__/vps_api.cpython-312.pyc +0 -0
vps/compute_vps_score.py +16 -11

app.py CHANGED Viewed

@@ -1,14 +1,15 @@
-from fastapi import FastAPI, UploadFile, File, Form , HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
-import sys
 import os
 import shutil
 import uuid
-# Ensure sibling module fluency is discoverable
-#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 from fluency.fluency_api import main as analyze_fluency_main
 from tone_modulation.tone_api import main as analyze_tone_main
 from vcs.vcs_api import main as analyze_vcs_main
@@ -18,375 +19,198 @@ from vps.vps_api import main as analyze_vps_main
 from ves.ves import calc_voice_engagement_score
 from transcribe import transcribe_audio
 from filler_count.filler_score import analyze_fillers
-#from emotion.emo_predict import predict_emotion
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # In production, replace "*" with allowed frontend domains
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
-@app.post("/analyze_fluency/")
-async def analyze_fluency(file: UploadFile):
-    # idk if we can use pydantic model here If we need I can add later
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path for temporary storage of the uploaded file this will be deleted after processing
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        result = analyze_fluency_main(temp_filepath, model_size="base")
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Fluency analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
-@app.post('/analyze_tone/')
-async def analyze_tone(file: UploadFile):
-    """
-    Endpoint to analyze tone of an uploaded audio file (.wav or .mp3).
-    """
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
     temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
     os.makedirs(temp_dir, exist_ok=True)
     try:
-        # Save uploaded file
         with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Analyze tone using your custom function
-        result = analyze_tone_main(temp_filepath)
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Tone analysis failed: {str(e)}")
     finally:
-        # Clean up temporary file
         if os.path.exists(temp_filepath):
             os.remove(temp_filepath)
-@app.post('/analyze_vcs/')
-async def analyze_vcs(file: UploadFile):
-    """
-    Endpoint to analyze voice clarity of an uploaded audio file (.wav or .mp3).
-    """
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Analyze voice clarity using your custom function
-        result = analyze_vcs_main(temp_filepath)
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Voice clarity analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
 @app.post('/analyze_vers/')
 async def analyze_vers(file: UploadFile):
-    """
-    Endpoint to analyze VERS of an uploaded audio file (.wav or .mp3).
-    """
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Analyze VERS using your custom function
-        result = analyze_vers_main(temp_filepath)
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"VERS analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
 @app.post('/voice_confidence/')
 async def analyze_voice_confidence(file: UploadFile):
-    """
-    Endpoint to analyze voice confidence of an uploaded audio file (.wav or .mp3).
-    """
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Analyze voice confidence using your custom function
-        result = analyze_voice_confidence_main(temp_filepath)
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Voice confidence analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
 @app.post('/analyze_vps/')
 async def analyze_vps(file: UploadFile):
-    """
-    Endpoint to analyze voice pacing score of an uploaded audio file (.wav or .mp3).
-    """
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Analyze voice pacing score using your custom function
-        result = analyze_vps_main(temp_filepath)
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Voice pacing score analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
 @app.post('/voice_engagement_score/')
 async def analyze_voice_engagement_score(file: UploadFile):
-    """
-    Endpoint to analyze voice engagement score of an uploaded audio file (.wav or .mp3).
-    """
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Analyze voice engagement score using your custom function
-        result = calc_voice_engagement_score(temp_filepath)
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Voice engagement score analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
 @app.post('/analyze_fillers/')
 async def analyze_fillers_count(file: UploadFile):
-    """
-    Endpoint to analyze filler words in an uploaded audio file (.wav or .mp3).
-    """
-    if not file.filename.endswith(('.wav', '.mp3','.mp4','.m4a','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Call the analysis function with the file path
-        result = analyze_fillers(temp_filepath)  # Pass the file path, not the UploadFile object
-        return JSONResponse(content=result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Filler analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
-import time
 @app.post('/transcribe/')
 async def transcribe(file: UploadFile):
-    """
-    Endpoint to transcribe an uploaded audio file ('.wav', '.mp3','mp4','.m4a','.flac' ).
-    """
-    #calculate time to transcribe
-    start_time = time.time()
-    if not file.filename.endswith(('.wav', '.mp3','mp4','.m4a','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav ,mp4 and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Transcribe using your custom function
-        result = transcribe_audio(temp_filepath,  model_size="base")
-        end_time = time.time()
-        transcription_time = end_time - start_time
-        response = {
-            "transcription": result,
-            "transcription_time": transcription_time
-        }
-        return JSONResponse(content=response)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)
-import datetime
 @app.post('/analyze_all/')
 async def analyze_all(file: UploadFile):
-    """
-    Endpoint to analyze all aspects of an uploaded audio file (.wav or .mp3).
-    """
     print(f"Received request at {datetime.datetime.now()} for file: {file.filename}")
-    if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-        raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-    # Generate a safe temporary file path
-    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-    temp_dir = "temp_uploads"
-    temp_filepath = os.path.join(temp_dir, temp_filename)
-    os.makedirs(temp_dir, exist_ok=True)
-    try:
-        # Save uploaded file
-        with open(temp_filepath, "wb") as buffer:
-            shutil.copyfileobj(file.file, buffer)
-        # Analyze all aspects using your custom functions
-        fluency_result = analyze_fluency_main(temp_filepath, model_size="base")
-        tone_result = analyze_tone_main(temp_filepath)
-        vcs_result = analyze_vcs_main(temp_filepath)
-        vers_result = analyze_vers_main(temp_filepath)
-        voice_confidence_result = analyze_voice_confidence_main(temp_filepath)
-        vps_result = analyze_vps_main(temp_filepath)
-        ves_result = calc_voice_engagement_score(temp_filepath)
-        filler_count = analyze_fillers(temp_filepath)  # Assuming this function returns a dict with filler count
-        transcript, language, _ = transcribe_audio(temp_filepath, "base") #fix this
-        #emotion = predict_emotion(temp_filepath)
-        avg_score = (fluency_result['fluency_score'] + tone_result['speech_dynamism_score'] + vcs_result['Voice Clarity Sore'] + vers_result['VERS Score'] + voice_confidence_result['voice_confidence_score'] + vps_result['VPS'] + ves_result['ves']) / 7
-        # Combine results into a single response
-        combined_result = {
-            "fluency": fluency_result,
-            "tone": tone_result,
-            "vcs": vcs_result,
-            "vers": vers_result,
-            "voice_confidence": voice_confidence_result,
-            "vps": vps_result,
-            "ves": ves_result,
-            "filler_words": filler_count,
-            "transcript": transcript,
-            "Detected Language": language,
-            #"emotion": emotion ,
-            "sank_score": avg_score
-        }
-        return JSONResponse(content=combined_result)
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
-    finally:
-        # Clean up temporary file
-        if os.path.exists(temp_filepath):
-            os.remove(temp_filepath)

+from fastapi import FastAPI, UploadFile, File, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 import os
 import shutil
 import uuid
+import tempfile
+import datetime
+import time
+from contextlib import contextmanager
+# Import analysis functions (assumed to be modified to accept transcript)
 from fluency.fluency_api import main as analyze_fluency_main
 from tone_modulation.tone_api import main as analyze_tone_main
 from vcs.vcs_api import main as analyze_vcs_main
 from ves.ves import calc_voice_engagement_score
 from transcribe import transcribe_audio
 from filler_count.filler_score import analyze_fillers
+from emotion.emo_predict import predict_emotion
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],  # Replace with specific domains in production
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
+ALLOWED_EXTENSIONS = {'.wav', '.mp3', '.m4a', '.mp4', '.flac'}
+@contextmanager
+def temp_file_handler(upload_file: UploadFile):
+    """Context manager to handle temporary file creation and cleanup."""
     temp_dir = "temp_uploads"
     os.makedirs(temp_dir, exist_ok=True)
+    temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(upload_file.filename)[1]}"
+    temp_filepath = os.path.join(temp_dir, temp_filename)
     try:
         with open(temp_filepath, "wb") as buffer:
+            shutil.copyfileobj(upload_file.file, buffer)
+        yield temp_filepath
     finally:
         if os.path.exists(temp_filepath):
             os.remove(temp_filepath)
+def validate_file_extension(filename: str):
+    """Validate if the file extension is allowed."""
+    if not os.path.splitext(filename)[1].lower() in ALLOWED_EXTENSIONS:
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid file type. Only .wav, .mp3, .m4a, .mp4, and .flac files are supported."
+        )
+async def process_audio_file(upload_file: UploadFile, analysis_func, **kwargs):
+    """Generic function to process an audio file with a given analysis function."""
+    validate_file_extension(upload_file.filename)
+    with temp_file_handler(upload_file) as temp_filepath:
+        try:
+            result = analysis_func(temp_filepath, **kwargs)
+            return JSONResponse(content=result)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+@app.post("/analyze_fluency/")
+async def analyze_fluency(file: UploadFile):
+    return await process_audio_file(file, analyze_fluency_main, model_size="base")
+@app.post('/analyze_tone/')
+async def analyze_tone(file: UploadFile):
+    return await process_audio_file(file, analyze_tone_main)
+@app.post('/analyze_vcs/')
+async def analyze_vcs(file: UploadFile):
+    return await process_audio_file(file, analyze_vcs_main)
 @app.post('/analyze_vers/')
 async def analyze_vers(file: UploadFile):
+    return await process_audio_file(file, analyze_vers_main)
 @app.post('/voice_confidence/')
 async def analyze_voice_confidence(file: UploadFile):
+    return await process_audio_file(file, analyze_voice_confidence_main)
 @app.post('/analyze_vps/')
 async def analyze_vps(file: UploadFile):
+    return await process_audio_file(file, analyze_vps_main)
 @app.post('/voice_engagement_score/')
 async def analyze_voice_engagement_score(file: UploadFile):
+    return await process_audio_file(file, calc_voice_engagement_score)
 @app.post('/analyze_fillers/')
 async def analyze_fillers_count(file: UploadFile):
+    return await process_audio_file(file, analyze_fillers)
 @app.post('/transcribe/')
 async def transcribe(file: UploadFile):
+    validate_file_extension(file.filename)
+    start_time = time.time()
+    with temp_file_handler(file) as temp_filepath:
+        try:
+            transcript, language, _ = transcribe_audio(temp_filepath, model_size="base")
+            end_time = time.time()
+            response = {
+                "transcription": transcript,
+                "transcription_time": end_time - start_time,
+                "language": language
+            }
+            return JSONResponse(content=response)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
 @app.post('/analyze_all/')
 async def analyze_all(file: UploadFile):
+    """Endpoint to analyze all aspects of an uploaded audio file with single transcription."""
     print(f"Received request at {datetime.datetime.now()} for file: {file.filename}")
+    validate_file_extension(file.filename)
+    with temp_file_handler(file) as temp_filepath:
+        try:
+            # Generate transcript once
+            transcript, language, _ = transcribe_audio(temp_filepath, model_size="base")
+            # Pass transcript to analysis functions that support it
+            analyze_all_start = time.time()
+            # Compute filler count
+            filler_start = time.time()
+            filler_count = analyze_fillers(temp_filepath)
+            filler_count_number = filler_count.get("total_fillers", 0)
+            filler_end = time.time()
+            print(f"Filler analysis time: {filler_end - filler_start} seconds")
+            fluency_start = time.time()
+            fluency_result = analyze_fluency_main(temp_filepath, model_size="base", filler_count = filler_count_number)
+            fluency_score = fluency_result['fluency_score']
+            fluency_end = time.time()
+            print(f"Fluency analysis time: {fluency_end - fluency_start} seconds")
+            tone_start = time.time()
+            tone_result = analyze_tone_main(temp_filepath)
+            tone_end = time.time()
+            print(f"Tone analysis time: {tone_end - tone_start} seconds")
+            vcs_start = time.time()
+            vcs_result = analyze_vcs_main(temp_filepath)
+            vcs_end = time.time()
+            print(f"VCS analysis time: {vcs_end - vcs_start} seconds")
+            vers_start = time.time()
+            vers_result = analyze_vers_main(temp_filepath, model_size="base", filler_count = filler_count_number)
+            vers_end = time.time()
+            print(f"VERS analysis time: {vers_end - vers_start} seconds")
+            voice_confidence_start = time.time()
+            voice_confidence_result = analyze_voice_confidence_main(temp_filepath, model_size="base", filler_count = filler_count_number, fluency_score = fluency_score)
+            print("voice_confidence_result:", voice_confidence_result)
+            voice_confidence_end = time.time()
+            print(f"Voice confidence analysis time: {voice_confidence_end - voice_confidence_start} seconds")
+            vps_start = time.time()
+            vps_result = analyze_vps_main(temp_filepath)
+            vps_end = time.time()
+            print(f"VPS analysis time: {vps_end - vps_start} seconds")
+            ves_start = time.time()
+            ves_result = calc_voice_engagement_score(temp_filepath)
+            ves_end = time.time()
+            print(f"VES analysis time: {ves_end - ves_start} seconds")
+            emotion_start = time.time()
+            emotion = predict_emotion(temp_filepath)
+            emotion_end = time.time()
+            print(f"Emotion analysis time: {emotion_end - emotion_start} seconds")
+            # Calculate average score
+            avg_score = (
+                fluency_result['fluency_score'] +
+                tone_result['speech_dynamism_score'] +
+                vcs_result['Voice Clarity Sore'] +
+                vers_result['VERS Score'] +
+                voice_confidence_result['voice_confidence_score'] +
+                vps_result['VPS'] +
+                ves_result['ves']
+            ) / 7
+            analyze_all_end = time.time()
+            # Combine results
+            combined_result = {
+                "fluency": fluency_result,
+                "tone": tone_result,
+                "vcs": vcs_result,
+                "vers": vers_result,
+                "voice_confidence": voice_confidence_result,
+                "vps": vps_result,
+                "ves": ves_result,
+                "filler_words": filler_count,
+                "transcript": transcript,
+                "Detected Language": language,
+                "emotion": emotion,
+                "sank_score": avg_score,
+                "analysis_time": analyze_all_end - analyze_all_start,
+            }
+            return JSONResponse(content=combined_result)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")

filler_count/__pycache__/filler_score.cpython-312.pyc CHANGED Viewed

Binary files a/filler_count/__pycache__/filler_score.cpython-312.pyc and b/filler_count/__pycache__/filler_score.cpython-312.pyc differ

filler_count/filler_score.py CHANGED Viewed

@@ -2,7 +2,7 @@ import re
 import whisper
 from pydub import AudioSegment  # For accurate duration calculation
-def analyze_fillers(file_path: str, model_size: str = "base") -> dict:
     """
     Analyzes English filler words in audio with proper duration handling.
     """
@@ -18,10 +18,11 @@ def analyze_fillers(file_path: str, model_size: str = "base") -> dict:
         audio = AudioSegment.from_file(file_path)
         duration = len(audio) / 1000  # Convert ms to seconds
-        # Then run Whisper transcription
-        model = whisper.load_model(model_size)
-        result = model.transcribe(file_path, word_timestamps=False, fp16=False)
-        transcript = result["text"]
         # Case-insensitive regex matching
         pattern = r"(?<!\w)(" + "|".join(map(re.escape, FILLER_WORDS)) + r")(?!\w)"

 import whisper
 from pydub import AudioSegment  # For accurate duration calculation
+def analyze_fillers(file_path: str, model_size: str = "base", transcript =  None ) -> dict:
     """
     Analyzes English filler words in audio with proper duration handling.
     """
         audio = AudioSegment.from_file(file_path)
         duration = len(audio) / 1000  # Convert ms to seconds
+        if transcript is None:
+            # Then run Whisper transcription
+            model = whisper.load_model(model_size)
+            result = model.transcribe(file_path, word_timestamps=False, fp16=False)
+            transcript = result["text"]
         # Case-insensitive regex matching
         pattern = r"(?<!\w)(" + "|".join(map(re.escape, FILLER_WORDS)) + r")(?!\w)"

fluency/__pycache__/compute_fluency.cpython-312.pyc CHANGED Viewed

Binary files a/fluency/__pycache__/compute_fluency.cpython-312.pyc and b/fluency/__pycache__/compute_fluency.cpython-312.pyc differ

fluency/__pycache__/fluency_api.cpython-312.pyc CHANGED Viewed

Binary files a/fluency/__pycache__/fluency_api.cpython-312.pyc and b/fluency/__pycache__/fluency_api.cpython-312.pyc differ

fluency/compute_fluency.py CHANGED Viewed

@@ -6,9 +6,14 @@ import librosa
 import numpy as np
 from typing import Dict, Any, Union
 from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
-from .filler_analyzer import detect_fillers
-def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
     """
     Compute fluency score and its components from a speech sample.
@@ -20,7 +25,7 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
         dict: A dictionary containing fluency score, SRS, PAS, and component scores.
     """
     # Transcribe audio
-    result = whisper_model.transcribe(file_path)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
@@ -28,8 +33,11 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
     if not transcript or not segments:
         raise ValueError("Empty transcript or segments from Whisper.")
-    # Detect filler words
-    filler_count, _ = detect_fillers(transcript)
     # Load audio
     y, sr = librosa.load(file_path, sr=None)
@@ -37,16 +45,20 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
     if duration <= 0:
         raise ValueError("Audio duration invalid or zero.")
-    # Calculate pitch variation (in semitones)
-    f0, voiced_flags, voiced_probs = librosa.pyin(
-        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-    voiced_f0 = f0[~np.isnan(f0)]
     pitch_variation = 0.0
     if voiced_f0.size > 0:
-        median_f0 = np.nanmedian(voiced_f0)
         median_f0 = max(median_f0, 1e-6)
         semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
-        pitch_variation = float(np.nanstd(semitone_diffs))
     # Analyze pauses
     long_pause_count = 0
@@ -85,22 +97,14 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
     # Calculate final fluency score
     fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
     fluency_score = fluency_result["score"]
-    insight = get_fluency_insight(fluency_score)
-    # Build and return comprehensive result
     return {
         "fluency_score": fluency_score,
-        "insight": insight,
         "SRS": srs_score,
         "PAS": pas_score,
-        "components": {
-            "wpm": words_per_min,
-            "filler_count": filler_count,
-            "long_pause_count": long_pause_count,
-            "pitch_variation": pitch_variation,
-            "word_count": word_count,
-            "duration": duration,
-            "pas_components": pas_result
-        },
         "transcript": transcript
-    }

 import numpy as np
 from typing import Dict, Any, Union
 from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
+from filler_count.filler_score import analyze_fillers
+from typing import Dict, Any
+import numpy as np
+import librosa
+import pyworld
+def compute_fluency_score(file_path: str, whisper_model, filler_count= None) -> Dict[str, Any]:
     """
     Compute fluency score and its components from a speech sample.
         dict: A dictionary containing fluency score, SRS, PAS, and component scores.
     """
     # Transcribe audio
+    result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
     if not transcript or not segments:
         raise ValueError("Empty transcript or segments from Whisper.")
+    if filler_count is None:
+        # Detect filler words
+        result = analyze_fillers(file_path,"base", transcript)
+        filler_score = result.get("filler_score", 0)
+        filler_count = result.get("total_fillers", 0)
     # Load audio
     y, sr = librosa.load(file_path, sr=None)
     if duration <= 0:
         raise ValueError("Audio duration invalid or zero.")
+    # Calculate pitch variation (in semitones) using pyworld
+    _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+    f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+    voiced_f0 = f0[f0 > 0]
+    voiced_f0 = voiced_f0[
+        (voiced_f0 > np.percentile(voiced_f0, 5)) &
+        (voiced_f0 < np.percentile(voiced_f0, 95))
+    ]
     pitch_variation = 0.0
     if voiced_f0.size > 0:
+        median_f0 = np.median(voiced_f0)
         median_f0 = max(median_f0, 1e-6)
         semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+        pitch_variation = float(np.std(semitone_diffs))
     # Analyze pauses
     long_pause_count = 0
     # Calculate final fluency score
     fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
     fluency_score = fluency_result["score"]
     return {
         "fluency_score": fluency_score,
         "SRS": srs_score,
         "PAS": pas_score,
+        "pitch_variation": pitch_variation,
+        "filler_count": filler_count,
+        "long_pause_count": long_pause_count,
+        "WPM": words_per_min,
         "transcript": transcript
+    }

fluency/fluency_api.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import whisper
 from .compute_fluency import compute_fluency_score
-def main(file_path: str, model_size: str = "base") -> dict:
     try:
         whisper_model = whisper.load_model(model_size)
-        results = compute_fluency_score(file_path, whisper_model)
         # Structure response
         response = {

 import whisper
 from .compute_fluency import compute_fluency_score
+def main(file_path: str, model_size: str = "base", filler_count = None) -> dict:
     try:
         whisper_model = whisper.load_model(model_size)
+        results = compute_fluency_score(file_path, whisper_model, filler_count)
         # Structure response
         response = {

vcs/__pycache__/compute_vcs.cpython-312.pyc CHANGED Viewed

Binary files a/vcs/__pycache__/compute_vcs.cpython-312.pyc and b/vcs/__pycache__/compute_vcs.cpython-312.pyc differ

vcs/compute_vcs.py CHANGED Viewed

@@ -19,7 +19,7 @@ def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]
         dict: A dictionary containing Voice Clarity Score and component scores.
     """
     # Transcribe audio
-    result = whisper_model.transcribe(file_path)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
@@ -36,8 +36,6 @@ def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]
     # Calculate Voice Clarity Score
     clarity_result = calculate_voice_clarity_score(y, sr, segments)
-    # Add transcript to results
-    clarity_result["transcript"] = transcript
     # Add word count and duration info for reference
     word_count = len(transcript.split())
@@ -61,54 +59,10 @@ def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
     clarity_results = compute_voice_clarity_score(file_path, whisper_model)
     vcs = clarity_results["VCS"]
-    # Load audio for additional analysis
-    y, sr = librosa.load(file_path, sr=None)
-    # Calculate additional voice quality metrics
-    # Voice stability - based on pitch (F0) stability
-    f0, voiced_flags, voiced_probs = librosa.pyin(
-        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-    voiced_f0 = f0[~np.isnan(f0)]
-    pitch_stability = 0.0
-    if voiced_f0.size > 0:
-        # Calculate coefficient of variation (lower is more stable)
-        cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
-        # Convert to score (0-100)
-        pitch_stability = max(0, min(100, 100 - (cv * 100)))
-    # Voice resonance - based on spectral bandwidth
-    bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
-    # Normalize (ideal range is around 1500-2500 Hz for speech)
-    if bandwidth < 1000:
-        resonance_score = max(0, bandwidth / 1000 * 70)  # Too narrow
-    elif bandwidth <= 2500:
-        resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30)  # Optimal range
-    else:
-        resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50))  # Too wide
-    # Voice strength - based on RMS energy
-    rms = np.mean(librosa.feature.rms(y=y))
-    # Normalize (typical speech RMS values range from 0.01 to 0.2)
-    strength_score = min(100, max(0, rms / 0.2 * 100))
-    # Combine additional metrics
-    additional_metrics = {
-        "pitch_stability": pitch_stability,
-        "voice_resonance": resonance_score,
-        "voice_strength": strength_score
-    }
     # Add to results
     combined_results = {
         "VCS": vcs,
-        "insight": clarity_results["insight"],
-        "components": {
-            **clarity_results["components"],
-            **additional_metrics
-        },
-        "transcript": clarity_results["transcript"]
     }
     return combined_results

         dict: A dictionary containing Voice Clarity Score and component scores.
     """
     # Transcribe audio
+    result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
     # Calculate Voice Clarity Score
     clarity_result = calculate_voice_clarity_score(y, sr, segments)
     # Add word count and duration info for reference
     word_count = len(transcript.split())
     clarity_results = compute_voice_clarity_score(file_path, whisper_model)
     vcs = clarity_results["VCS"]
     # Add to results
     combined_results = {
         "VCS": vcs,
     }
     return combined_results

vers/__pycache__/compute_vers_score.cpython-312.pyc CHANGED Viewed

Binary files a/vers/__pycache__/compute_vers_score.cpython-312.pyc and b/vers/__pycache__/compute_vers_score.cpython-312.pyc differ

vers/__pycache__/vers.cpython-312.pyc CHANGED Viewed

Binary files a/vers/__pycache__/vers.cpython-312.pyc and b/vers/__pycache__/vers.cpython-312.pyc differ

vers/__pycache__/vers_api.cpython-312.pyc CHANGED Viewed

Binary files a/vers/__pycache__/vers_api.cpython-312.pyc and b/vers/__pycache__/vers_api.cpython-312.pyc differ

vers/compute_vers_score.py CHANGED Viewed

@@ -4,19 +4,23 @@ import numpy as np
 import math
 from .filler_analyzer import detect_fillers
 from .find_valence import  get_valence_score
-def compute_vers_score(file_path: str, whisper_model) -> dict:
     """
     Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.
     """
-    result = whisper_model.transcribe(file_path)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
     # Filler count
-    filler_count, _ = detect_fillers(transcript)
     # Load audio
     y, sr = librosa.load(file_path, sr=None)
@@ -32,16 +36,20 @@ def compute_vers_score(file_path: str, whisper_model) -> dict:
     vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
     vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0
-    # Pitch variation
-    f0, voiced_flags, voiced_probs = librosa.pyin(
-        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-    voiced_f0 = f0[~np.isnan(f0)]
     pitch_variation = 0.0
     if voiced_f0.size > 0:
-        median_f0 = np.nanmedian(voiced_f0)
         median_f0 = max(median_f0, 1e-6)
         semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
-        pitch_variation = float(np.nanstd(semitone_diffs))
     # Pause analysis
     total_speaking_time = 0.0

 import math
 from .filler_analyzer import detect_fillers
 from .find_valence import  get_valence_score
+from filler_count.filler_score import analyze_fillers
+import pyworld
+def compute_vers_score(file_path: str, whisper_model, filler_count = None) -> dict:
     """
     Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.
     """
+    result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
+    if filler_count is None:
     # Filler count
+        result = analyze_fillers(file_path,'base', transcript)
+        filler_count = result.get("filler_count", 0)
     # Load audio
     y, sr = librosa.load(file_path, sr=None)
     vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
     vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0
+    # Calculate pitch variation (in semitones) using pyworld
+    _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+    f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+    voiced_f0 = f0[f0 > 0]
+    voiced_f0 = voiced_f0[
+        (voiced_f0 > np.percentile(voiced_f0, 5)) &
+        (voiced_f0 < np.percentile(voiced_f0, 95))
+    ]
     pitch_variation = 0.0
     if voiced_f0.size > 0:
+        median_f0 = np.median(voiced_f0)
         median_f0 = max(median_f0, 1e-6)
         semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+        pitch_variation = float(np.std(semitone_diffs))
     # Pause analysis
     total_speaking_time = 0.0

vers/vers.py CHANGED Viewed

@@ -22,7 +22,7 @@ def calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores):
     valence_stability = 100 - (np.std(valence_scores) * 20)
     ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability))
-    print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}")
     return ESS
 def calc_lcs(volume_std, vol_max_db, mean_volume_db):
@@ -67,7 +67,7 @@ def calc_srs(wpm, filler_count, long_pause_count, pitch_variation):
     # Final SRS Score
     SRS = (0.45 * wpm_consistency) + (0.55 * stability)
-    print(f"wpm_consistency: {wpm_consistency}, stability: {stability}")
     return min(100, max(0, SRS))
 def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores):
@@ -93,10 +93,10 @@ def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, v
     return {
         "VERS": int(VERS),
-        "ESS": round(ESS, 1),
-        "LCS": round(LCS, 1),
-        "SRS": round(SRS, 1),
-        "insight": insight
     }
 # # Test input

     valence_stability = 100 - (np.std(valence_scores) * 20)
     ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability))
+    #print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}")
     return ESS
 def calc_lcs(volume_std, vol_max_db, mean_volume_db):
     # Final SRS Score
     SRS = (0.45 * wpm_consistency) + (0.55 * stability)
+    #print(f"wpm_consistency: {wpm_consistency}, stability: {stability}")
     return min(100, max(0, SRS))
 def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores):
     return {
         "VERS": int(VERS),
+        # "ESS": round(ESS, 1),
+        # "LCS": round(LCS, 1),
+        # "SRS": round(SRS, 1),
+        # "insight": insight
     }
 # # Test input

vers/vers_api.py CHANGED Viewed

@@ -17,13 +17,13 @@ def convert_numpy_types(obj):
     else:
         return obj
-def main(file_path: str, model_size: str = "base") -> dict:
     try:
         # Load whisper model
         whisper_model = whisper.load_model(model_size)
         # Compute VERS score
-        results = compute_vers_score(file_path, whisper_model)
         # Convert any NumPy types to native Python types
         results = convert_numpy_types(results)

     else:
         return obj
+def main(file_path: str, model_size: str = "base", filler_count = None) -> dict:
     try:
         # Load whisper model
         whisper_model = whisper.load_model(model_size)
         # Compute VERS score
+        results = compute_vers_score(file_path, whisper_model, filler_count)
         # Convert any NumPy types to native Python types
         results = convert_numpy_types(results)

voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc CHANGED Viewed

Binary files a/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc and b/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc differ

voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc CHANGED Viewed

Binary files a/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc and b/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc differ

voice_confidence_score/voice_confidence.py CHANGED Viewed

@@ -5,11 +5,11 @@ from fluency.compute_fluency import compute_fluency_score
 from vcs.compute_vcs import analyze_voice_quality
-def calc_fluency_score(audio_path, whisper_model):
  # Calculate fluency score
     print(f"Analyzing fluency for {audio_path}...")
-    results = compute_fluency_score(audio_path, whisper_model)
     fluency_score = results['fluency_score']
     return fluency_score
@@ -26,9 +26,12 @@ def calc_vcs(audio_path, whisper_model):
 dominance = 5.6 # dummy for now i add later
-def calc_voice_confidence_score(audio_path, model):
-    fluency_score = calc_fluency_score(audio_path, model)
     vcs = calc_vcs(audio_path, model)
     # Calculate voice confidence score

 from vcs.compute_vcs import analyze_voice_quality
+def calc_fluency_score(audio_path, whisper_model, filler_count=None):
  # Calculate fluency score
     print(f"Analyzing fluency for {audio_path}...")
+    results = compute_fluency_score(audio_path, whisper_model, filler_count)
     fluency_score = results['fluency_score']
     return fluency_score
 dominance = 5.6 # dummy for now i add later
+def calc_voice_confidence_score(audio_path, model, filler_count= None, fluency_score=None):
+    if fluency_score is None:
+        print(' No args passed Calling calc_fluency_score')
+        fluency_score = calc_fluency_score(audio_path, model, filler_count)
     vcs = calc_vcs(audio_path, model)
     # Calculate voice confidence score

voice_confidence_score/voice_confidence_api.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import whisper
 from .voice_confidence import calc_voice_confidence_score
-def main(file_path: str, model_size: str = "base") -> dict:
     try:
         # Load the Whisper model
         whisper_model = whisper.load_model(model_size)
         # Calculate the voice confidence score
-        result = calc_voice_confidence_score(file_path, whisper_model)
         # Return the result as a dictionary
         return {"voice_confidence_score": round(result, 2)}

 import whisper
 from .voice_confidence import calc_voice_confidence_score
+def main(file_path: str, model_size: str = "base", filler_count = None, fluency_score = None) -> dict:
     try:
         # Load the Whisper model
         whisper_model = whisper.load_model(model_size)
         # Calculate the voice confidence score
+        result = calc_voice_confidence_score(file_path, whisper_model, filler_count, fluency_score)
         # Return the result as a dictionary
         return {"voice_confidence_score": round(result, 2)}

vps/__pycache__/compute_vps_score.cpython-312.pyc CHANGED Viewed

Binary files a/vps/__pycache__/compute_vps_score.cpython-312.pyc and b/vps/__pycache__/compute_vps_score.cpython-312.pyc differ

vps/__pycache__/vps.cpython-312.pyc CHANGED Viewed

Binary files a/vps/__pycache__/vps.cpython-312.pyc and b/vps/__pycache__/vps.cpython-312.pyc differ

vps/__pycache__/vps_api.cpython-312.pyc CHANGED Viewed

Binary files a/vps/__pycache__/vps_api.cpython-312.pyc and b/vps/__pycache__/vps_api.cpython-312.pyc differ

vps/compute_vps_score.py CHANGED Viewed

@@ -2,7 +2,9 @@ from .vps import calculate_vps  # Your file where calc_srs, calculate_pas, calcu
 import librosa
 import numpy as np
 import math
-from .filler_analyzer import detect_fillers
 def compute_vps_score(file_path: str, whisper_model) -> dict:
     """
@@ -16,7 +18,7 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
         dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores.
     """
     # Transcribe
-    result = whisper_model.transcribe(file_path)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
@@ -25,7 +27,8 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
         raise ValueError("Empty transcript or segments from Whisper.")
     # Filler count
-    filler_count, _ = detect_fillers(transcript)
     # Load audio
     y, sr = librosa.load(file_path, sr=None)
@@ -33,16 +36,20 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
     if duration <= 0:
         raise ValueError("Audio duration invalid or zero.")
-    # Pitch variation (in semitones)
-    f0, voiced_flags, voiced_probs = librosa.pyin(
-        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-    voiced_f0 = f0[~np.isnan(f0)]
     pitch_variation = 0.0
     if voiced_f0.size > 0:
-        median_f0 = np.nanmedian(voiced_f0)
         median_f0 = max(median_f0, 1e-6)
         semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
-        pitch_variation = float(np.nanstd(semitone_diffs))
     # Pause analysis
     long_pause_count = 0
@@ -74,6 +81,4 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
         sr=sr
     )
-    # Include transcript optionally
-    vps_result["transcript"] = transcript
     return vps_result

 import librosa
 import numpy as np
 import math
+import pyworld
+from filler_count.filler_score import analyze_fillers
 def compute_vps_score(file_path: str, whisper_model) -> dict:
     """
         dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores.
     """
     # Transcribe
+    result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
     transcript = result.get("text", "").strip()
     segments = result.get("segments", [])
         raise ValueError("Empty transcript or segments from Whisper.")
     # Filler count
+    result = analyze_fillers(file_path,'base',transcript)
+    filler_count = result.get("filler_count", 0)
     # Load audio
     y, sr = librosa.load(file_path, sr=None)
     if duration <= 0:
         raise ValueError("Audio duration invalid or zero.")
+    # Calculate pitch variation (in semitones) using pyworld
+    _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+    f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+    voiced_f0 = f0[f0 > 0]
+    voiced_f0 = voiced_f0[
+        (voiced_f0 > np.percentile(voiced_f0, 5)) &
+        (voiced_f0 < np.percentile(voiced_f0, 95))
+    ]
     pitch_variation = 0.0
     if voiced_f0.size > 0:
+        median_f0 = np.median(voiced_f0)
         median_f0 = max(median_f0, 1e-6)
         semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+        pitch_variation = float(np.std(semitone_diffs))
     # Pause analysis
     long_pause_count = 0
         sr=sr
     )
     return vps_result