mulasagg committed
Commit aef3b1e · 1 Parent(s): e6cd41c

API optimizations

app.py CHANGED
@@ -1,14 +1,15 @@
- from fastapi import FastAPI, UploadFile, File, Form , HTTPException
  from fastapi.responses import JSONResponse
  from fastapi.middleware.cors import CORSMiddleware
- import sys
  import os
  import shutil
  import uuid

- # Ensure sibling module fluency is discoverable
- #sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-
  from fluency.fluency_api import main as analyze_fluency_main
  from tone_modulation.tone_api import main as analyze_tone_main
  from vcs.vcs_api import main as analyze_vcs_main
@@ -18,375 +19,198 @@ from vps.vps_api import main as analyze_vps_main
  from ves.ves import calc_voice_engagement_score
  from transcribe import transcribe_audio
  from filler_count.filler_score import analyze_fillers
- #from emotion.emo_predict import predict_emotion

  app = FastAPI()

  app.add_middleware(
      CORSMiddleware,
-     allow_origins=["*"], # In production, replace "*" with allowed frontend domains
      allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )

- @app.post("/analyze_fluency/")
- async def analyze_fluency(file: UploadFile):
-     # idk if we can use pydantic model here If we need I can add later
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-
-     # Generate a safe temporary file path for temporary storage of the uploaded file this will be deleted after processing
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)

-
-         result = analyze_fluency_main(temp_filepath, model_size="base")
-
-         return JSONResponse(content=result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Fluency analysis failed: {str(e)}")
-
-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)
-
- @app.post('/analyze_tone/')
- async def analyze_tone(file: UploadFile):
-     """
-     Endpoint to analyze tone of an uploaded audio file (.wav or .mp3).
-     """
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-
-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
      temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
      os.makedirs(temp_dir, exist_ok=True)
-
      try:
-         # Save uploaded file
          with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Analyze tone using your custom function
-         result = analyze_tone_main(temp_filepath)
-
-         return JSONResponse(content=result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Tone analysis failed: {str(e)}")
-
      finally:
-         # Clean up temporary file
          if os.path.exists(temp_filepath):
              os.remove(temp_filepath)

- @app.post('/analyze_vcs/')
- async def analyze_vcs(file: UploadFile):
-     """
-     Endpoint to analyze voice clarity of an uploaded audio file (.wav or .mp3).
-     """
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")

-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Analyze voice clarity using your custom function
-         result = analyze_vcs_main(temp_filepath)

-         return JSONResponse(content=result)

-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Voice clarity analysis failed: {str(e)}")

-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)

  @app.post('/analyze_vers/')
  async def analyze_vers(file: UploadFile):
-     """
-     Endpoint to analyze VERS of an uploaded audio file (.wav or .mp3).
-     """
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-
-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)

-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Analyze VERS using your custom function
-         result = analyze_vers_main(temp_filepath)
-
-         return JSONResponse(content=result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"VERS analysis failed: {str(e)}")
-
-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)
-
  @app.post('/voice_confidence/')
  async def analyze_voice_confidence(file: UploadFile):
-     """
-     Endpoint to analyze voice confidence of an uploaded audio file (.wav or .mp3).
-     """
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-
-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Analyze voice confidence using your custom function
-         result = analyze_voice_confidence_main(temp_filepath)
-
-         return JSONResponse(content=result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Voice confidence analysis failed: {str(e)}")
-
-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)

  @app.post('/analyze_vps/')
  async def analyze_vps(file: UploadFile):
-     """
-     Endpoint to analyze voice pacing score of an uploaded audio file (.wav or .mp3).
-     """
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-
-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Analyze voice pacing score using your custom function
-         result = analyze_vps_main(temp_filepath)
-
-         return JSONResponse(content=result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Voice pacing score analysis failed: {str(e)}")
-
-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)

  @app.post('/voice_engagement_score/')
  async def analyze_voice_engagement_score(file: UploadFile):
-     """
-     Endpoint to analyze voice engagement score of an uploaded audio file (.wav or .mp3).
-     """
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-
-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Analyze voice engagement score using your custom function
-         result = calc_voice_engagement_score(temp_filepath)
-
-         return JSONResponse(content=result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Voice engagement score analysis failed: {str(e)}")
-
-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)

  @app.post('/analyze_fillers/')
  async def analyze_fillers_count(file: UploadFile):
-     """
-     Endpoint to analyze filler words in an uploaded audio file (.wav or .mp3).
-     """
-     if not file.filename.endswith(('.wav', '.mp3','.mp4','.m4a','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")
-
-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Call the analysis function with the file path
-         result = analyze_fillers(temp_filepath) # Pass the file path, not the UploadFile object
-
-         return JSONResponse(content=result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Filler analysis failed: {str(e)}")
-
-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)
-
-
- import time
-
-

  @app.post('/transcribe/')
  async def transcribe(file: UploadFile):
-     """
-     Endpoint to transcribe an uploaded audio file ('.wav', '.mp3','mp4','.m4a','.flac' ).
-     """
-     #calculate time to transcribe
-     start_time = time.time()
-     if not file.filename.endswith(('.wav', '.mp3','mp4','.m4a','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav ,mp4 and .mp3 files are supported.")

-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Transcribe using your custom function
-         result = transcribe_audio(temp_filepath, model_size="base")
-         end_time = time.time()
-         transcription_time = end_time - start_time
-         response = {
-             "transcription": result,
-             "transcription_time": transcription_time
-         }
-
-         return JSONResponse(content=response)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")
-
-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)
-
- import datetime

  @app.post('/analyze_all/')
  async def analyze_all(file: UploadFile):
-     """
-     Endpoint to analyze all aspects of an uploaded audio file (.wav or .mp3).
-     """
      print(f"Received request at {datetime.datetime.now()} for file: {file.filename}")
-     if not file.filename.endswith(('.wav', '.mp3','.m4a','.mp4','.flac')):
-         raise HTTPException(status_code=400, detail="Invalid file type. Only .wav and .mp3 files are supported.")

-     # Generate a safe temporary file path
-     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(file.filename)[1]}"
-     temp_dir = "temp_uploads"
-     temp_filepath = os.path.join(temp_dir, temp_filename)
-     os.makedirs(temp_dir, exist_ok=True)
-
-     try:
-         # Save uploaded file
-         with open(temp_filepath, "wb") as buffer:
-             shutil.copyfileobj(file.file, buffer)
-
-         # Analyze all aspects using your custom functions
-         fluency_result = analyze_fluency_main(temp_filepath, model_size="base")
-         tone_result = analyze_tone_main(temp_filepath)
-         vcs_result = analyze_vcs_main(temp_filepath)
-         vers_result = analyze_vers_main(temp_filepath)
-         voice_confidence_result = analyze_voice_confidence_main(temp_filepath)
-         vps_result = analyze_vps_main(temp_filepath)
-         ves_result = calc_voice_engagement_score(temp_filepath)
-         filler_count = analyze_fillers(temp_filepath) # Assuming this function returns a dict with filler count
-         transcript, language, _ = transcribe_audio(temp_filepath, "base") #fix this
-         #emotion = predict_emotion(temp_filepath)
-         avg_score = (fluency_result['fluency_score'] + tone_result['speech_dynamism_score'] + vcs_result['Voice Clarity Sore'] + vers_result['VERS Score'] + voice_confidence_result['voice_confidence_score'] + vps_result['VPS'] + ves_result['ves']) / 7
-
-
-         # Combine results into a single response
-         combined_result = {
-             "fluency": fluency_result,
-             "tone": tone_result,
-             "vcs": vcs_result,
-             "vers": vers_result,
-             "voice_confidence": voice_confidence_result,
-             "vps": vps_result,
-             "ves": ves_result,
-             "filler_words": filler_count,
-             "transcript": transcript,
-             "Detected Language": language,
-             #"emotion": emotion ,
-             "sank_score": avg_score
-         }
-
-         return JSONResponse(content=combined_result)
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")

-     finally:
-         # Clean up temporary file
-         if os.path.exists(temp_filepath):
-             os.remove(temp_filepath)
+ from fastapi import FastAPI, UploadFile, File, HTTPException
  from fastapi.responses import JSONResponse
  from fastapi.middleware.cors import CORSMiddleware
  import os
  import shutil
  import uuid
+ import tempfile
+ import datetime
+ import time
+ from contextlib import contextmanager

+ # Import analysis functions (assumed to be modified to accept transcript)
  from fluency.fluency_api import main as analyze_fluency_main
  from tone_modulation.tone_api import main as analyze_tone_main
  from vcs.vcs_api import main as analyze_vcs_main
  from ves.ves import calc_voice_engagement_score
  from transcribe import transcribe_audio
  from filler_count.filler_score import analyze_fillers
+ from emotion.emo_predict import predict_emotion

  app = FastAPI()

  app.add_middleware(
      CORSMiddleware,
+     allow_origins=["*"], # Replace with specific domains in production
      allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )

+ ALLOWED_EXTENSIONS = {'.wav', '.mp3', '.m4a', '.mp4', '.flac'}

+ @contextmanager
+ def temp_file_handler(upload_file: UploadFile):
+     """Context manager to handle temporary file creation and cleanup."""
      temp_dir = "temp_uploads"
      os.makedirs(temp_dir, exist_ok=True)
+     temp_filename = f"temp_{uuid.uuid4()}{os.path.splitext(upload_file.filename)[1]}"
+     temp_filepath = os.path.join(temp_dir, temp_filename)
+
      try:
          with open(temp_filepath, "wb") as buffer:
+             shutil.copyfileobj(upload_file.file, buffer)
+         yield temp_filepath
      finally:
          if os.path.exists(temp_filepath):
              os.remove(temp_filepath)

+ def validate_file_extension(filename: str):
+     """Validate if the file extension is allowed."""
+     if not os.path.splitext(filename)[1].lower() in ALLOWED_EXTENSIONS:
+         raise HTTPException(
+             status_code=400,
+             detail="Invalid file type. Only .wav, .mp3, .m4a, .mp4, and .flac files are supported."
+         )
+
+ async def process_audio_file(upload_file: UploadFile, analysis_func, **kwargs):
+     """Generic function to process an audio file with a given analysis function."""
+     validate_file_extension(upload_file.filename)

+     with temp_file_handler(upload_file) as temp_filepath:
+         try:
+             result = analysis_func(temp_filepath, **kwargs)
+             return JSONResponse(content=result)
+         except Exception as e:
+             raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")

+ @app.post("/analyze_fluency/")
+ async def analyze_fluency(file: UploadFile):
+     return await process_audio_file(file, analyze_fluency_main, model_size="base")

+ @app.post('/analyze_tone/')
+ async def analyze_tone(file: UploadFile):
+     return await process_audio_file(file, analyze_tone_main)

+ @app.post('/analyze_vcs/')
+ async def analyze_vcs(file: UploadFile):
+     return await process_audio_file(file, analyze_vcs_main)

  @app.post('/analyze_vers/')
  async def analyze_vers(file: UploadFile):
+     return await process_audio_file(file, analyze_vers_main)

  @app.post('/voice_confidence/')
  async def analyze_voice_confidence(file: UploadFile):
+     return await process_audio_file(file, analyze_voice_confidence_main)

  @app.post('/analyze_vps/')
  async def analyze_vps(file: UploadFile):
+     return await process_audio_file(file, analyze_vps_main)

  @app.post('/voice_engagement_score/')
  async def analyze_voice_engagement_score(file: UploadFile):
+     return await process_audio_file(file, calc_voice_engagement_score)

  @app.post('/analyze_fillers/')
  async def analyze_fillers_count(file: UploadFile):
+     return await process_audio_file(file, analyze_fillers)

  @app.post('/transcribe/')
  async def transcribe(file: UploadFile):
+     validate_file_extension(file.filename)

+     start_time = time.time()
+     with temp_file_handler(file) as temp_filepath:
+         try:
+             transcript, language, _ = transcribe_audio(temp_filepath, model_size="base")
+             end_time = time.time()
+             response = {
+                 "transcription": transcript,
+                 "transcription_time": end_time - start_time,
+                 "language": language
+             }
+             return JSONResponse(content=response)
+         except Exception as e:
+             raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}")

  @app.post('/analyze_all/')
  async def analyze_all(file: UploadFile):
+     """Endpoint to analyze all aspects of an uploaded audio file with single transcription."""
      print(f"Received request at {datetime.datetime.now()} for file: {file.filename}")
+     validate_file_extension(file.filename)

+     with temp_file_handler(file) as temp_filepath:
+         try:
+             # Generate transcript once
+             transcript, language, _ = transcribe_audio(temp_filepath, model_size="base")
+
+             # Pass transcript to analysis functions that support it
+             analyze_all_start = time.time()
+
+             # Compute filler count
+             filler_start = time.time()
+             filler_count = analyze_fillers(temp_filepath)
+             filler_count_number = filler_count.get("total_fillers", 0)
+             filler_end = time.time()
+             print(f"Filler analysis time: {filler_end - filler_start} seconds")
+
+             fluency_start = time.time()
+             fluency_result = analyze_fluency_main(temp_filepath, model_size="base", filler_count = filler_count_number)
+             fluency_score = fluency_result['fluency_score']
+             fluency_end = time.time()
+             print(f"Fluency analysis time: {fluency_end - fluency_start} seconds")
+
+             tone_start = time.time()
+             tone_result = analyze_tone_main(temp_filepath)
+             tone_end = time.time()
+             print(f"Tone analysis time: {tone_end - tone_start} seconds")
+
+             vcs_start = time.time()
+             vcs_result = analyze_vcs_main(temp_filepath)
+             vcs_end = time.time()
+             print(f"VCS analysis time: {vcs_end - vcs_start} seconds")
+
+             vers_start = time.time()
+             vers_result = analyze_vers_main(temp_filepath, model_size="base", filler_count = filler_count_number)
+             vers_end = time.time()
+             print(f"VERS analysis time: {vers_end - vers_start} seconds")
+
+             voice_confidence_start = time.time()
+             voice_confidence_result = analyze_voice_confidence_main(temp_filepath, model_size="base", filler_count = filler_count_number, fluency_score = fluency_score)
+             print("voice_confidence_result:", voice_confidence_result)

+             voice_confidence_end = time.time()
+             print(f"Voice confidence analysis time: {voice_confidence_end - voice_confidence_start} seconds")
+
+             vps_start = time.time()
+             vps_result = analyze_vps_main(temp_filepath)
+             vps_end = time.time()
+             print(f"VPS analysis time: {vps_end - vps_start} seconds")
+             ves_start = time.time()
+             ves_result = calc_voice_engagement_score(temp_filepath)
+             ves_end = time.time()
+             print(f"VES analysis time: {ves_end - ves_start} seconds")
+             emotion_start = time.time()
+             emotion = predict_emotion(temp_filepath)
+             emotion_end = time.time()
+             print(f"Emotion analysis time: {emotion_end - emotion_start} seconds")
+
+             # Calculate average score
+             avg_score = (
+                 fluency_result['fluency_score'] +
+                 tone_result['speech_dynamism_score'] +
+                 vcs_result['Voice Clarity Sore'] +
+                 vers_result['VERS Score'] +
+                 voice_confidence_result['voice_confidence_score'] +
+                 vps_result['VPS'] +
+                 ves_result['ves']
+             ) / 7
+
+             analyze_all_end = time.time()
+
+             # Combine results
+             combined_result = {
+                 "fluency": fluency_result,
+                 "tone": tone_result,
+                 "vcs": vcs_result,
+                 "vers": vers_result,
+                 "voice_confidence": voice_confidence_result,
+                 "vps": vps_result,
+                 "ves": ves_result,
+                 "filler_words": filler_count,
+                 "transcript": transcript,
+                 "Detected Language": language,
+                 "emotion": emotion,
+                 "sank_score": avg_score,
+                 "analysis_time": analyze_all_end - analyze_all_start,
+             }
+
+             return JSONResponse(content=combined_result)
+
+         except Exception as e:
+             raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
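
For reference, a minimal client-side sketch of how the consolidated endpoints can be exercised. The host, port, and "sample.wav" file name are assumptions for illustration, not part of this commit; the endpoint path and the response keys ("sank_score", "Detected Language") come from the diff above:

    import requests  # third-party HTTP client

    BASE_URL = "http://localhost:8000"  # hypothetical local deployment

    # POST the audio as multipart form data; the field must be named "file"
    # to match the UploadFile parameter of the endpoints above.
    with open("sample.wav", "rb") as f:
        resp = requests.post(f"{BASE_URL}/analyze_all/",
                             files={"file": ("sample.wav", f, "audio/wav")})
    resp.raise_for_status()
    result = resp.json()
    print(result["sank_score"], result["Detected Language"])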
filler_count/__pycache__/filler_score.cpython-312.pyc CHANGED
Binary files a/filler_count/__pycache__/filler_score.cpython-312.pyc and b/filler_count/__pycache__/filler_score.cpython-312.pyc differ
 
filler_count/filler_score.py CHANGED
@@ -2,7 +2,7 @@ import re
  import whisper
  from pydub import AudioSegment # For accurate duration calculation

- def analyze_fillers(file_path: str, model_size: str = "base") -> dict:
      """
      Analyzes English filler words in audio with proper duration handling.
      """
@@ -18,10 +18,11 @@ def analyze_fillers(file_path: str, model_size: str = "base") -> dict:
      audio = AudioSegment.from_file(file_path)
      duration = len(audio) / 1000 # Convert ms to seconds

-     # Then run Whisper transcription
-     model = whisper.load_model(model_size)
-     result = model.transcribe(file_path, word_timestamps=False, fp16=False)
-     transcript = result["text"]

      # Case-insensitive regex matching
      pattern = r"(?<!\w)(" + "|".join(map(re.escape, FILLER_WORDS)) + r")(?!\w)"

  import whisper
  from pydub import AudioSegment # For accurate duration calculation

+ def analyze_fillers(file_path: str, model_size: str = "base", transcript = None ) -> dict:
      """
      Analyzes English filler words in audio with proper duration handling.
      """

      audio = AudioSegment.from_file(file_path)
      duration = len(audio) / 1000 # Convert ms to seconds

+     if transcript is None:
+         # Then run Whisper transcription
+         model = whisper.load_model(model_size)
+         result = model.transcribe(file_path, word_timestamps=False, fp16=False)
+         transcript = result["text"]

      # Case-insensitive regex matching
      pattern = r"(?<!\w)(" + "|".join(map(re.escape, FILLER_WORDS)) + r")(?!\w)"
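
The new optional transcript parameter is what lets callers skip a second Whisper pass. A small sketch of the intended call pattern, with "sample.wav" as a placeholder input (an assumption; the signature and the "total_fillers" key come from this diff):

    import whisper
    from filler_count.filler_score import analyze_fillers

    # Transcribe once up front...
    model = whisper.load_model("base")
    text = model.transcribe("sample.wav", word_timestamps=False, fp16=False)["text"]

    # ...then reuse the text so analyze_fillers does not reload Whisper.
    result = analyze_fillers("sample.wav", model_size="base", transcript=text)
    print(result.get("total_fillers", 0))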
fluency/__pycache__/compute_fluency.cpython-312.pyc CHANGED
Binary files a/fluency/__pycache__/compute_fluency.cpython-312.pyc and b/fluency/__pycache__/compute_fluency.cpython-312.pyc differ
 
fluency/__pycache__/fluency_api.cpython-312.pyc CHANGED
Binary files a/fluency/__pycache__/fluency_api.cpython-312.pyc and b/fluency/__pycache__/fluency_api.cpython-312.pyc differ
 
fluency/compute_fluency.py CHANGED
@@ -6,9 +6,14 @@ import librosa
  import numpy as np
  from typing import Dict, Any, Union
  from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
- from .filler_analyzer import detect_fillers

- def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
      """
      Compute fluency score and its components from a speech sample.

@@ -20,7 +25,7 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
          dict: A dictionary containing fluency score, SRS, PAS, and component scores.
      """
      # Transcribe audio
-     result = whisper_model.transcribe(file_path)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])

@@ -28,8 +33,11 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
      if not transcript or not segments:
          raise ValueError("Empty transcript or segments from Whisper.")

-     # Detect filler words
-     filler_count, _ = detect_fillers(transcript)

      # Load audio
      y, sr = librosa.load(file_path, sr=None)
@@ -37,16 +45,20 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
      if duration <= 0:
          raise ValueError("Audio duration invalid or zero.")

-     # Calculate pitch variation (in semitones)
-     f0, voiced_flags, voiced_probs = librosa.pyin(
-         y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-     voiced_f0 = f0[~np.isnan(f0)]
      pitch_variation = 0.0
      if voiced_f0.size > 0:
-         median_f0 = np.nanmedian(voiced_f0)
          median_f0 = max(median_f0, 1e-6)
          semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
-         pitch_variation = float(np.nanstd(semitone_diffs))

      # Analyze pauses
      long_pause_count = 0
@@ -85,22 +97,14 @@ def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
      # Calculate final fluency score
      fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
      fluency_score = fluency_result["score"]
-     insight = get_fluency_insight(fluency_score)

-     # Build and return comprehensive result
      return {
          "fluency_score": fluency_score,
-         "insight": insight,
          "SRS": srs_score,
          "PAS": pas_score,
-         "components": {
-             "wpm": words_per_min,
-             "filler_count": filler_count,
-             "long_pause_count": long_pause_count,
-             "pitch_variation": pitch_variation,
-             "word_count": word_count,
-             "duration": duration,
-             "pas_components": pas_result
-         },
          "transcript": transcript
-     }

  import numpy as np
  from typing import Dict, Any, Union
  from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight
+ from filler_count.filler_score import analyze_fillers

+ from typing import Dict, Any
+ import numpy as np
+ import librosa
+ import pyworld
+
+ def compute_fluency_score(file_path: str, whisper_model, filler_count= None) -> Dict[str, Any]:
      """
      Compute fluency score and its components from a speech sample.

          dict: A dictionary containing fluency score, SRS, PAS, and component scores.
      """
      # Transcribe audio
+     result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])

      if not transcript or not segments:
          raise ValueError("Empty transcript or segments from Whisper.")

+     if filler_count is None:
+         # Detect filler words
+         result = analyze_fillers(file_path,"base", transcript)
+         filler_score = result.get("filler_score", 0)
+         filler_count = result.get("total_fillers", 0)

      # Load audio
      y, sr = librosa.load(file_path, sr=None)

      if duration <= 0:
          raise ValueError("Audio duration invalid or zero.")

+     # Calculate pitch variation (in semitones) using pyworld
+     _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+     f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+     voiced_f0 = f0[f0 > 0]
+     voiced_f0 = voiced_f0[
+         (voiced_f0 > np.percentile(voiced_f0, 5)) &
+         (voiced_f0 < np.percentile(voiced_f0, 95))
+     ]
      pitch_variation = 0.0
      if voiced_f0.size > 0:
+         median_f0 = np.median(voiced_f0)
          median_f0 = max(median_f0, 1e-6)
          semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+         pitch_variation = float(np.std(semitone_diffs))

      # Analyze pauses
      long_pause_count = 0

      # Calculate final fluency score
      fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
      fluency_score = fluency_result["score"]

      return {
          "fluency_score": fluency_score,
          "SRS": srs_score,
          "PAS": pas_score,
+         "pitch_variation": pitch_variation,
+         "filler_count": filler_count,
+         "long_pause_count": long_pause_count,
+         "WPM": words_per_min,
          "transcript": transcript
+     }
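
The librosa.pyin call is replaced here (and again in the vers and vps modules below) by pyworld's harvest/stonemask pipeline with a 5th-95th percentile trim. A self-contained sketch of that metric as a standalone function, with an added guard for fully unvoiced audio that the committed code itself does not include (an assumption worth noting):

    import librosa
    import numpy as np
    import pyworld

    def pitch_variation_semitones(path: str) -> float:
        # Load mono audio at its native sample rate.
        y, sr = librosa.load(path, sr=None)
        # pyworld expects float64; harvest returns an F0 track plus frame times.
        _f0, t = pyworld.harvest(y.astype(np.float64), sr,
                                 f0_floor=80.0, f0_ceil=400.0,
                                 frame_period=1000 * 256 / sr)
        f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
        voiced = f0[f0 > 0]  # harvest marks unvoiced frames with 0
        if voiced.size == 0:
            return 0.0  # guard: no voiced frames at all
        # Trim the outer 5% on each side to suppress octave-error outliers.
        voiced = voiced[(voiced > np.percentile(voiced, 5)) &
                        (voiced < np.percentile(voiced, 95))]
        if voiced.size == 0:
            return 0.0
        median_f0 = max(np.median(voiced), 1e-6)
        return float(np.std(12 * np.log2(voiced / median_f0)))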
fluency/fluency_api.py CHANGED
@@ -1,12 +1,12 @@
  import whisper
  from .compute_fluency import compute_fluency_score

- def main(file_path: str, model_size: str = "base") -> dict:
      try:

          whisper_model = whisper.load_model(model_size)

-         results = compute_fluency_score(file_path, whisper_model)

          # Structure response
          response = {

  import whisper
  from .compute_fluency import compute_fluency_score

+ def main(file_path: str, model_size: str = "base", filler_count = None) -> dict:
      try:

          whisper_model = whisper.load_model(model_size)

+         results = compute_fluency_score(file_path, whisper_model, filler_count)

          # Structure response
          response = {
vcs/__pycache__/compute_vcs.cpython-312.pyc CHANGED
Binary files a/vcs/__pycache__/compute_vcs.cpython-312.pyc and b/vcs/__pycache__/compute_vcs.cpython-312.pyc differ
 
vcs/compute_vcs.py CHANGED
@@ -19,7 +19,7 @@ def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]
          dict: A dictionary containing Voice Clarity Score and component scores.
      """
      # Transcribe audio
-     result = whisper_model.transcribe(file_path)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])

@@ -36,8 +36,6 @@ def compute_voice_clarity_score(file_path: str, whisper_model) -> Dict[str, Any]
      # Calculate Voice Clarity Score
      clarity_result = calculate_voice_clarity_score(y, sr, segments)

-     # Add transcript to results
-     clarity_result["transcript"] = transcript

      # Add word count and duration info for reference
      word_count = len(transcript.split())
@@ -61,54 +59,10 @@ def analyze_voice_quality(file_path: str, whisper_model) -> Dict[str, Any]:
      clarity_results = compute_voice_clarity_score(file_path, whisper_model)
      vcs = clarity_results["VCS"]

-     # Load audio for additional analysis
-     y, sr = librosa.load(file_path, sr=None)
-
-     # Calculate additional voice quality metrics
-
-     # Voice stability - based on pitch (F0) stability
-     f0, voiced_flags, voiced_probs = librosa.pyin(
-         y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-     voiced_f0 = f0[~np.isnan(f0)]
-
-     pitch_stability = 0.0
-     if voiced_f0.size > 0:
-         # Calculate coefficient of variation (lower is more stable)
-         cv = np.std(voiced_f0) / np.mean(voiced_f0) if np.mean(voiced_f0) > 0 else float('inf')
-         # Convert to score (0-100)
-         pitch_stability = max(0, min(100, 100 - (cv * 100)))
-
-     # Voice resonance - based on spectral bandwidth
-     bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
-     # Normalize (ideal range is around 1500-2500 Hz for speech)
-     if bandwidth < 1000:
-         resonance_score = max(0, bandwidth / 1000 * 70) # Too narrow
-     elif bandwidth <= 2500:
-         resonance_score = 70 + ((bandwidth - 1000) / 1500 * 30) # Optimal range
-     else:
-         resonance_score = max(0, 100 - ((bandwidth - 2500) / 2500 * 50)) # Too wide
-
-     # Voice strength - based on RMS energy
-     rms = np.mean(librosa.feature.rms(y=y))
-     # Normalize (typical speech RMS values range from 0.01 to 0.2)
-     strength_score = min(100, max(0, rms / 0.2 * 100))
-
-     # Combine additional metrics
-     additional_metrics = {
-         "pitch_stability": pitch_stability,
-         "voice_resonance": resonance_score,
-         "voice_strength": strength_score
-     }

      # Add to results
      combined_results = {
          "VCS": vcs,
-         "insight": clarity_results["insight"],
-         "components": {
-             **clarity_results["components"],
-             **additional_metrics
-         },
-         "transcript": clarity_results["transcript"]
      }

      return combined_results

          dict: A dictionary containing Voice Clarity Score and component scores.
      """
      # Transcribe audio
+     result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])

      # Calculate Voice Clarity Score
      clarity_result = calculate_voice_clarity_score(y, sr, segments)

      # Add word count and duration info for reference
      word_count = len(transcript.split())

      clarity_results = compute_voice_clarity_score(file_path, whisper_model)
      vcs = clarity_results["VCS"]

      # Add to results
      combined_results = {
          "VCS": vcs,
      }

      return combined_results
vers/__pycache__/compute_vers_score.cpython-312.pyc CHANGED
Binary files a/vers/__pycache__/compute_vers_score.cpython-312.pyc and b/vers/__pycache__/compute_vers_score.cpython-312.pyc differ
 
vers/__pycache__/vers.cpython-312.pyc CHANGED
Binary files a/vers/__pycache__/vers.cpython-312.pyc and b/vers/__pycache__/vers.cpython-312.pyc differ
 
vers/__pycache__/vers_api.cpython-312.pyc CHANGED
Binary files a/vers/__pycache__/vers_api.cpython-312.pyc and b/vers/__pycache__/vers_api.cpython-312.pyc differ
 
vers/compute_vers_score.py CHANGED
@@ -4,19 +4,23 @@ import numpy as np
  import math
  from .filler_analyzer import detect_fillers
  from .find_valence import get_valence_score

- def compute_vers_score(file_path: str, whisper_model) -> dict:
      """
      Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.
      """
-     result = whisper_model.transcribe(file_path)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])


      # Filler count
-     filler_count, _ = detect_fillers(transcript)

      # Load audio
      y, sr = librosa.load(file_path, sr=None)
@@ -32,16 +36,20 @@ def compute_vers_score(file_path: str, whisper_model) -> dict:
      vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
      vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0

-     # Pitch variation
-     f0, voiced_flags, voiced_probs = librosa.pyin(
-         y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-     voiced_f0 = f0[~np.isnan(f0)]
      pitch_variation = 0.0
      if voiced_f0.size > 0:
-         median_f0 = np.nanmedian(voiced_f0)
          median_f0 = max(median_f0, 1e-6)
          semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
-         pitch_variation = float(np.nanstd(semitone_diffs))

      # Pause analysis
      total_speaking_time = 0.0

  import math
  from .filler_analyzer import detect_fillers
  from .find_valence import get_valence_score
+ from filler_count.filler_score import analyze_fillers
+ import pyworld

+ def compute_vers_score(file_path: str, whisper_model, filler_count = None) -> dict:
      """
      Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.
      """
+     result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])


+     if filler_count is None:
          # Filler count
+         result = analyze_fillers(file_path,'base', transcript)
+         filler_count = result.get("filler_count", 0)

      # Load audio
      y, sr = librosa.load(file_path, sr=None)

      vol_max = np.max(np.abs(y)) if y.size > 0 else 0.0
      vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0

+     # Calculate pitch variation (in semitones) using pyworld
+     _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+     f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+     voiced_f0 = f0[f0 > 0]
+     voiced_f0 = voiced_f0[
+         (voiced_f0 > np.percentile(voiced_f0, 5)) &
+         (voiced_f0 < np.percentile(voiced_f0, 95))
+     ]
      pitch_variation = 0.0
      if voiced_f0.size > 0:
+         median_f0 = np.median(voiced_f0)
          median_f0 = max(median_f0, 1e-6)
          semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+         pitch_variation = float(np.std(semitone_diffs))

      # Pause analysis
      total_speaking_time = 0.0
vers/vers.py CHANGED
@@ -22,7 +22,7 @@ def calc_ess(pitch_variation, vol_max_db, mean_volume_db, valence_scores):
      valence_stability = 100 - (np.std(valence_scores) * 20)

      ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability))
-     print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}")
      return ESS

  def calc_lcs(volume_std, vol_max_db, mean_volume_db):
@@ -67,7 +67,7 @@ def calc_srs(wpm, filler_count, long_pause_count, pitch_variation):

      # Final SRS Score
      SRS = (0.45 * wpm_consistency) + (0.55 * stability)
-     print(f"wpm_consistency: {wpm_consistency}, stability: {stability}")
      return min(100, max(0, SRS))

  def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores):
@@ -93,10 +93,10 @@ def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, v

      return {
          "VERS": int(VERS),
-         "ESS": round(ESS, 1),
-         "LCS": round(LCS, 1),
-         "SRS": round(SRS, 1),
-         "insight": insight
      }

  # # Test input

      valence_stability = 100 - (np.std(valence_scores) * 20)

      ESS = (0.45 * float(tonal_steadiness)) + (0.35 * float(loudness_stability)) + (0.2 * float(valence_stability))
+     #print(f" tonal_steadiness: {tonal_steadiness}, loudness_stability: {loudness_stability}, valence_stability: {valence_stability}")
      return ESS

  def calc_lcs(volume_std, vol_max_db, mean_volume_db):

      # Final SRS Score
      SRS = (0.45 * wpm_consistency) + (0.55 * stability)
+     #print(f"wpm_consistency: {wpm_consistency}, stability: {stability}")
      return min(100, max(0, SRS))

  def calc_vers(filler_count, long_pause_count, pitch_variation, mean_volume_db, vol_max_db, wpm, volume_std, valence_scores):

      return {
          "VERS": int(VERS),
+         # "ESS": round(ESS, 1),
+         # "LCS": round(LCS, 1),
+         # "SRS": round(SRS, 1),
+         # "insight": insight
      }

  # # Test input
vers/vers_api.py CHANGED
@@ -17,13 +17,13 @@ def convert_numpy_types(obj):
      else:
          return obj

- def main(file_path: str, model_size: str = "base") -> dict:
      try:
          # Load whisper model
          whisper_model = whisper.load_model(model_size)

          # Compute VERS score
-         results = compute_vers_score(file_path, whisper_model)

          # Convert any NumPy types to native Python types
          results = convert_numpy_types(results)

      else:
          return obj

+ def main(file_path: str, model_size: str = "base", filler_count = None) -> dict:
      try:
          # Load whisper model
          whisper_model = whisper.load_model(model_size)

          # Compute VERS score
+         results = compute_vers_score(file_path, whisper_model, filler_count)

          # Convert any NumPy types to native Python types
          results = convert_numpy_types(results)
voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc CHANGED
Binary files a/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc and b/voice_confidence_score/__pycache__/voice_confidence.cpython-312.pyc differ
 
voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc CHANGED
Binary files a/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc and b/voice_confidence_score/__pycache__/voice_confidence_api.cpython-312.pyc differ
 
voice_confidence_score/voice_confidence.py CHANGED
@@ -5,11 +5,11 @@ from fluency.compute_fluency import compute_fluency_score
  from vcs.compute_vcs import analyze_voice_quality


- def calc_fluency_score(audio_path, whisper_model):

      # Calculate fluency score
      print(f"Analyzing fluency for {audio_path}...")
-     results = compute_fluency_score(audio_path, whisper_model)
      fluency_score = results['fluency_score']

      return fluency_score
@@ -26,9 +26,12 @@ def calc_vcs(audio_path, whisper_model):

  dominance = 5.6 # dummy for now i add later

- def calc_voice_confidence_score(audio_path, model):

-     fluency_score = calc_fluency_score(audio_path, model)
      vcs = calc_vcs(audio_path, model)

      # Calculate voice confidence score

  from vcs.compute_vcs import analyze_voice_quality


+ def calc_fluency_score(audio_path, whisper_model, filler_count=None):

      # Calculate fluency score
      print(f"Analyzing fluency for {audio_path}...")
+     results = compute_fluency_score(audio_path, whisper_model, filler_count)
      fluency_score = results['fluency_score']

      return fluency_score

  dominance = 5.6 # dummy for now i add later

+ def calc_voice_confidence_score(audio_path, model, filler_count= None, fluency_score=None):

+     if fluency_score is None:
+         print(' No args passed Calling calc_fluency_score')
+         fluency_score = calc_fluency_score(audio_path, model, filler_count)
+
      vcs = calc_vcs(audio_path, model)

      # Calculate voice confidence score
voice_confidence_score/voice_confidence_api.py CHANGED
@@ -1,13 +1,13 @@
  import whisper
  from .voice_confidence import calc_voice_confidence_score

- def main(file_path: str, model_size: str = "base") -> dict:
      try:
          # Load the Whisper model
          whisper_model = whisper.load_model(model_size)

          # Calculate the voice confidence score
-         result = calc_voice_confidence_score(file_path, whisper_model)

          # Return the result as a dictionary
          return {"voice_confidence_score": round(result, 2)}

  import whisper
  from .voice_confidence import calc_voice_confidence_score

+ def main(file_path: str, model_size: str = "base", filler_count = None, fluency_score = None) -> dict:
      try:
          # Load the Whisper model
          whisper_model = whisper.load_model(model_size)

          # Calculate the voice confidence score
+         result = calc_voice_confidence_score(file_path, whisper_model, filler_count, fluency_score)

          # Return the result as a dictionary
          return {"voice_confidence_score": round(result, 2)}
vps/__pycache__/compute_vps_score.cpython-312.pyc CHANGED
Binary files a/vps/__pycache__/compute_vps_score.cpython-312.pyc and b/vps/__pycache__/compute_vps_score.cpython-312.pyc differ
 
vps/__pycache__/vps.cpython-312.pyc CHANGED
Binary files a/vps/__pycache__/vps.cpython-312.pyc and b/vps/__pycache__/vps.cpython-312.pyc differ
 
vps/__pycache__/vps_api.cpython-312.pyc CHANGED
Binary files a/vps/__pycache__/vps_api.cpython-312.pyc and b/vps/__pycache__/vps_api.cpython-312.pyc differ
 
vps/compute_vps_score.py CHANGED
@@ -2,7 +2,9 @@ from .vps import calculate_vps # Your file where calc_srs, calculate_pas, calcu
  import librosa
  import numpy as np
  import math
- from .filler_analyzer import detect_fillers

  def compute_vps_score(file_path: str, whisper_model) -> dict:
      """
@@ -16,7 +18,7 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
          dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores.
      """
      # Transcribe
-     result = whisper_model.transcribe(file_path)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])

@@ -25,7 +27,8 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
          raise ValueError("Empty transcript or segments from Whisper.")

      # Filler count
-     filler_count, _ = detect_fillers(transcript)

      # Load audio
      y, sr = librosa.load(file_path, sr=None)
@@ -33,16 +36,20 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
      if duration <= 0:
          raise ValueError("Audio duration invalid or zero.")

-     # Pitch variation (in semitones)
-     f0, voiced_flags, voiced_probs = librosa.pyin(
-         y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)
-     voiced_f0 = f0[~np.isnan(f0)]
      pitch_variation = 0.0
      if voiced_f0.size > 0:
-         median_f0 = np.nanmedian(voiced_f0)
          median_f0 = max(median_f0, 1e-6)
          semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
-         pitch_variation = float(np.nanstd(semitone_diffs))

      # Pause analysis
      long_pause_count = 0
@@ -74,6 +81,4 @@ def compute_vps_score(file_path: str, whisper_model) -> dict:
          sr=sr
      )

-     # Include transcript optionally
-     vps_result["transcript"] = transcript
      return vps_result

  import librosa
  import numpy as np
  import math
+
+ import pyworld
+ from filler_count.filler_score import analyze_fillers

  def compute_vps_score(file_path: str, whisper_model) -> dict:
      """

          dict: A dictionary containing VPS, SRS, PAS, RCS, and component scores.
      """
      # Transcribe
+     result = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
      transcript = result.get("text", "").strip()
      segments = result.get("segments", [])

          raise ValueError("Empty transcript or segments from Whisper.")

      # Filler count
+     result = analyze_fillers(file_path,'base',transcript)
+     filler_count = result.get("filler_count", 0)

      # Load audio
      y, sr = librosa.load(file_path, sr=None)

      if duration <= 0:
          raise ValueError("Audio duration invalid or zero.")

+     # Calculate pitch variation (in semitones) using pyworld
+     _f0, t = pyworld.harvest(y.astype(np.float64), sr, f0_floor=80.0, f0_ceil=400.0, frame_period=1000 * 256 / sr)
+     f0 = pyworld.stonemask(y.astype(np.float64), _f0, t, sr)
+     voiced_f0 = f0[f0 > 0]
+     voiced_f0 = voiced_f0[
+         (voiced_f0 > np.percentile(voiced_f0, 5)) &
+         (voiced_f0 < np.percentile(voiced_f0, 95))
+     ]
      pitch_variation = 0.0
      if voiced_f0.size > 0:
+         median_f0 = np.median(voiced_f0)
          median_f0 = max(median_f0, 1e-6)
          semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
+         pitch_variation = float(np.std(semitone_diffs))

      # Pause analysis
      long_pause_count = 0

          sr=sr
      )

      return vps_result