Coco-18 committed
Commit 2f7060f · verified · 1 Parent(s): a4dd810

Update app.py

Files changed (1)
  1. app.py +135 -91
app.py CHANGED
@@ -4,7 +4,6 @@ import sys
 import logging
 import traceback
 
-
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
@@ -47,7 +46,7 @@ try:
     from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
     from transformers import MarianMTModel, MarianTokenizer
     from werkzeug.utils import secure_filename
-
+
     logger.info("✅ All required libraries imported successfully")
 except ImportError as e:
     logger.critical(f"❌ Failed to import necessary libraries: {str(e)}")
@@ -77,7 +76,7 @@ try:
         cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
     )
     logger.info("✅ ASR processor loaded successfully")
-
+
     asr_model = Wav2Vec2ForCTC.from_pretrained(
         ASR_MODEL_ID,
         cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
@@ -113,13 +112,13 @@ for lang, model_id in TTS_MODELS.items():
     logger.info(f"🔄 Loading TTS model for {lang}: {model_id}")
     try:
         tts_processors[lang] = AutoTokenizer.from_pretrained(
-            model_id,
+            model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
         )
         logger.info(f"✅ {lang} TTS processor loaded")
-
+
         tts_models[lang] = VitsModel.from_pretrained(
-            model_id,
+            model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
         )
         tts_models[lang].to(device)
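
For context, a minimal sketch of how a VITS checkpoint like the ones loaded above is typically run with transformers (the model ID here is an assumed placeholder, not necessarily one of this repo's TTS_MODELS entries):

import torch
from transformers import VitsModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")  # placeholder checkpoint
tts = VitsModel.from_pretrained("facebook/mms-tts-eng")
inputs = tok("Mayap a abak", return_tensors="pt")
with torch.no_grad():
    waveform = tts(**inputs).waveform  # float tensor, shape (batch, num_samples)
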
@@ -135,7 +134,7 @@ TRANSLATION_MODELS = {
     "eng-pam": "Coco-18/opus-mt-en-pam",
     "tgl-eng": "Helsinki-NLP/opus-mt-tl-en",
     "eng-tgl": "Helsinki-NLP/opus-mt-en-tl",
-    "phi": "Coco-18/opus-mt-phi"
+    "phi": "Coco-18/opus-mt-phi"
 }
 
 logger.info(f"🔄 Loading Translation model: {TRANSLATION_MODELS}")
@@ -146,14 +145,14 @@ translation_tokenizers = {}
 
 for model_key, model_id in TRANSLATION_MODELS.items():
     logger.info(f"🔄 Loading Translation model: {model_id}")
-
+
     try:
         translation_tokenizers[model_key] = MarianTokenizer.from_pretrained(
             model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
         )
         logger.info(f"✅ Translation tokenizer loaded successfully for {model_key}")
-
+
         translation_models[model_key] = MarianMTModel.from_pretrained(
             model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
@@ -169,7 +168,7 @@ for model_key, model_id in TRANSLATION_MODELS.items():
 # Constants
 SAMPLE_RATE = 16000
 OUTPUT_DIR = "/tmp/audio_outputs"
-REFERENCE_AUDIO_DIR = "./reference_audios"
+REFERENCE_AUDIO_DIR = "./reference_audios"
 
 try:
     os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -177,43 +176,47 @@ try:
 except Exception as e:
     logger.error(f"❌ Failed to create output directory: {str(e)}")
 
+
 @app.route("/", methods=["GET"])
 def home():
     return jsonify({"message": "Speech API is running", "status": "active"})
 
+
 @app.route("/health", methods=["GET"])
 def health_check():
     # Initialize direct language pair statuses based on loaded models
     translation_status = {}
-
+
     # Add status for direct model pairs
     for lang_pair in ["pam-eng", "eng-pam", "tgl-eng", "eng-tgl"]:
-        translation_status[lang_pair] = "loaded" if lang_pair in translation_models and translation_models[lang_pair] is not None else "failed"
-
+        translation_status[lang_pair] = "loaded" if lang_pair in translation_models and translation_models[
+            lang_pair] is not None else "failed"
+
     # Add special phi model status
     phi_status = "loaded" if "phi" in translation_models and translation_models["phi"] is not None else "failed"
     translation_status["pam-fil"] = phi_status
     translation_status["fil-pam"] = phi_status
     translation_status["pam-tgl"] = phi_status  # Using phi model but replacing tgl with fil
     translation_status["tgl-pam"] = phi_status  # Using phi model but replacing tgl with fil
-
+
     health_status = {
         "api_status": "online",
         "asr_model": "loaded" if asr_model is not None else "failed",
-        "tts_models": {lang: "loaded" if model is not None else "failed"
-                       for lang, model in tts_models.items()},
+        "tts_models": {lang: "loaded" if model is not None else "failed"
+                       for lang, model in tts_models.items()},
        "translation_models": translation_status,
        "device": device
    }
    return jsonify(health_status)
 
+
 @app.route("/check_references", methods=["GET"])
 def check_references():
     """Endpoint to check if reference files exist and are accessible"""
-    ref_patterns = ["mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
+    ref_patterns = ["mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
                     "mayap_a_bengi", "komusta_ka"]
     results = {}
-
+
     for pattern in ref_patterns:
         pattern_dir = os.path.join(REFERENCE_AUDIO_DIR, pattern)
         if os.path.exists(pattern_dir):
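
Once the server is up, the /health endpoint defined above can be exercised like this (a sketch; the port comes from app.run at the bottom of the file, and the response shape follows health_status):

import requests

resp = requests.get("http://localhost:7860/health")
print(resp.json())  # e.g. {"api_status": "online", "asr_model": "loaded", ...}
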
@@ -229,19 +232,20 @@ def check_references():
                 "exists": False,
                 "path": pattern_dir
             }
-
+
     return jsonify({
         "reference_audio_dir": REFERENCE_AUDIO_DIR,
         "directory_exists": os.path.exists(REFERENCE_AUDIO_DIR),
         "patterns": results
     })
 
+
 @app.route("/asr", methods=["POST"])
 def transcribe_audio():
     if asr_model is None or asr_processor is None:
         logger.error("❌ ASR endpoint called but models aren't loaded")
         return jsonify({"error": "ASR model not available"}), 503
-
+
     try:
         if "audio" not in request.files:
             logger.warning("⚠️ ASR request missing audio file")
@@ -252,7 +256,8 @@ def transcribe_audio():
 
         if language not in LANGUAGE_CODES:
             logger.warning(f"⚠️ Unsupported language requested: {language}")
-            return jsonify({"error": f"Unsupported language: {language}. Available: {list(LANGUAGE_CODES.keys())}"}), 400
+            return jsonify(
+                {"error": f"Unsupported language: {language}. Available: {list(LANGUAGE_CODES.keys())}"}), 400
 
         lang_code = LANGUAGE_CODES[language]
         logger.info(f"🔄 Processing {language} audio for ASR")
@@ -310,9 +315,9 @@ def transcribe_audio():
             logits = asr_model(**inputs).logits
             ids = torch.argmax(logits, dim=-1)[0]
             transcription = asr_processor.decode(ids)
-
+
         logger.info(f"✅ Transcription ({language}): {transcription}")
-
+
         # Clean up temp files
         try:
             os.unlink(temp_audio_path)
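
The three lines above are greedy CTC decoding; spelled out with shapes (names as in the hunk):

logits = asr_model(**inputs).logits         # (batch, time_frames, vocab_size)
ids = torch.argmax(logits, dim=-1)[0]       # most likely token per frame
transcription = asr_processor.decode(ids)   # collapses repeats and strips CTC blanks
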
@@ -320,7 +325,7 @@ def transcribe_audio():
                 os.unlink(wav_path)
         except Exception as e:
             logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")
-
+
         return jsonify({
             "transcription": transcription,
             "language": language,
@@ -344,24 +349,24 @@ def generate_tts():
         if not data:
             logger.warning("⚠️ TTS endpoint called with no JSON data")
             return jsonify({"error": "No JSON data provided"}), 400
-
+
         text_input = data.get("text", "").strip()
         language = data.get("language", "kapampangan").lower()
 
         if not text_input:
             logger.warning("⚠️ TTS request with empty text")
             return jsonify({"error": "No text provided"}), 400
-
+
         if language not in TTS_MODELS:
             logger.warning(f"⚠️ TTS requested for unsupported language: {language}")
             return jsonify({"error": f"Invalid language. Available options: {list(TTS_MODELS.keys())}"}), 400
-
+
         if tts_models[language] is None:
             logger.error(f"❌ TTS model for {language} not loaded")
             return jsonify({"error": f"TTS model for {language} not available"}), 503
 
         logger.info(f"🔄 Generating TTS for language: {language}, text: '{text_input}'")
-
+
         try:
             processor = tts_processors[language]
             model = tts_models[language]
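
A usage sketch for the TTS endpoint (the text and language JSON keys come from the handler above; the /tts route path is assumed from the handler name, since the decorator falls outside this diff):

import requests

resp = requests.post(
    "http://localhost:7860/tts",
    json={"text": "Mayap a abak", "language": "kapampangan"},
)
print(resp.json())  # expected to reference a file served by the audio download route
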
@@ -409,10 +414,11 @@ def download_audio(filename):
     if os.path.exists(file_path):
         logger.info(f"📤 Serving audio file: {file_path}")
         return send_file(file_path, mimetype="audio/wav", as_attachment=True)
-
+
     logger.warning(f"⚠️ Requested file not found: {file_path}")
     return jsonify({"error": "File not found"}), 404
 
+
 @app.route("/translate", methods=["POST"])
 def translate_text():
     try:
@@ -420,7 +426,7 @@ def translate_text():
         if not data:
             logger.warning("⚠️ Translation endpoint called with no JSON data")
             return jsonify({"error": "No JSON data provided"}), 400
-
+
         source_text = data.get("text", "").strip()
         source_language = data.get("source_language", "").lower()
         target_language = data.get("target_language", "").lower()
@@ -428,18 +434,18 @@ def translate_text():
         if not source_text:
             logger.warning("⚠️ Translation request with empty text")
             return jsonify({"error": "No text provided"}), 400
-
+
         # Map language names to codes
         source_code = LANGUAGE_CODES.get(source_language, source_language)
         target_code = LANGUAGE_CODES.get(target_language, target_language)
-
+
         logger.info(f"🔄 Translating from {source_language} to {target_language}: '{source_text}'")
-
+
         # Special handling for pam-fil, fil-pam, pam-tgl and tgl-pam using the phi model
         use_phi_model = False
         actual_source_code = source_code
         actual_target_code = target_code
-
+
         # Check if we need to use the phi model with fil replacement
         if (source_code == "pam" and target_code == "fil") or (source_code == "fil" and target_code == "pam"):
             use_phi_model = True
@@ -449,38 +455,38 @@ def translate_text():
         elif (source_code == "tgl" and target_code == "pam"):
             use_phi_model = True
             actual_source_code = "fil"  # Replace tgl with fil for the phi model
-
+
         if use_phi_model:
             model_key = "phi"
-
+
             # Check if we have the phi model
             if model_key not in translation_models or translation_models[model_key] is None:
                 logger.error(f"❌ Translation model for {model_key} not loaded")
                 return jsonify({"error": f"Translation model not available"}), 503
-
+
             try:
                 # Get the phi model and tokenizer
                 model = translation_models[model_key]
                 tokenizer = translation_tokenizers[model_key]
-
+
                 # Prepend target language token to input
                 input_text = f">>{actual_target_code}<< {source_text}"
-
+
                 logger.info(f"🔄 Using phi model with input: '{input_text}'")
-
+
                 # Tokenize the text
                 tokenized = tokenizer(input_text, return_tensors="pt", padding=True)
                 tokenized = {k: v.to(device) for k, v in tokenized.items()}
-
+
                 # Generate translation
                 with torch.no_grad():
                     translated = model.generate(**tokenized)
-
+
                 # Decode the translation
                 result = tokenizer.decode(translated[0], skip_special_tokens=True)
-
+
                 logger.info(f"✅ Translation result: '{result}'")
-
+
                 return jsonify({
                     "translated_text": result,
                     "source_language": source_language,
@@ -493,34 +499,35 @@ def translate_text():
         else:
             # Create the regular language pair key for other language pairs
             lang_pair = f"{source_code}-{target_code}"
-
+
             # Check if we have a model for this language pair
             if lang_pair not in translation_models:
                 logger.warning(f"⚠️ No translation model available for {lang_pair}")
-                return jsonify({"error": f"Translation from {source_language} to {target_language} is not supported yet"}), 400
-
+                return jsonify(
+                    {"error": f"Translation from {source_language} to {target_language} is not supported yet"}), 400
+
             if translation_models[lang_pair] is None or translation_tokenizers[lang_pair] is None:
                 logger.error(f"❌ Translation model for {lang_pair} not loaded")
                 return jsonify({"error": f"Translation model not available"}), 503
-
+
             try:
                 # Regular translation process for other language pairs
                 model = translation_models[lang_pair]
                 tokenizer = translation_tokenizers[lang_pair]
-
+
                 # Tokenize the text
                 tokenized = tokenizer(source_text, return_tensors="pt", padding=True)
                 tokenized = {k: v.to(device) for k, v in tokenized.items()}
-
+
                 # Generate translation
                 with torch.no_grad():
                     translated = model.generate(**tokenized)
-
+
                 # Decode the translation
                 result = tokenizer.decode(translated[0], skip_special_tokens=True)
-
+
                 logger.info(f"✅ Translation result: '{result}'")
-
+
                 return jsonify({
                     "translated_text": result,
                     "source_language": source_language,
@@ -530,30 +537,33 @@ def translate_text():
                 logger.error(f"❌ Translation processing failed: {str(e)}")
                 logger.debug(f"Stack trace: {traceback.format_exc()}")
                 return jsonify({"error": f"Translation processing failed: {str(e)}"}), 500
-
+
     except Exception as e:
         logger.error(f"❌ Unhandled exception in translation endpoint: {str(e)}")
         logger.debug(f"Stack trace: {traceback.format_exc()}")
         return jsonify({"error": f"Internal server error: {str(e)}"}), 500
 
+
 # Add this function to your app.py
 def calculate_similarity(text1, text2):
     """Calculate text similarity percentage."""
+
     def clean_text(text):
         return text.lower()
-
+
     clean1 = clean_text(text1)
     clean2 = clean_text(text2)
-
+
     matcher = SequenceMatcher(None, clean1, clean2)
     return matcher.ratio() * 100
 
+
 @app.route("/evaluate", methods=["POST"])
 def evaluate_pronunciation():
     if asr_model is None or asr_processor is None:
         logger.error("❌ Evaluation endpoint called but ASR models aren't loaded")
         return jsonify({"error": "ASR model not available"}), 503
-
+
     try:
         if "audio" not in request.files:
             logger.warning("⚠️ Evaluation request missing audio file")
@@ -570,17 +580,25 @@ def evaluate_pronunciation():
 
         # Construct full reference directory path
         reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_locator)
+        logger.info(f"📁 Reference directory path: {reference_dir}")
+
         if not os.path.exists(reference_dir):
             logger.warning(f"⚠️ Reference directory not found: {reference_dir}")
             return jsonify({"error": f"Reference audio directory not found: {reference_locator}"}), 404
 
         reference_files = glob.glob(os.path.join(reference_dir, "*.wav"))
+        logger.info(f"📁 Reference files found: {len(reference_files)}")
+
         if not reference_files:
             logger.warning(f"⚠️ No reference audio files found in {reference_dir}")
             return jsonify({"error": f"No reference audio found for {reference_locator}"}), 404
 
+        # Log actual file paths for debugging
+        for ref_file in reference_files:
+            logger.debug(f"📁 Reference file: {ref_file}")
+
         lang_code = LANGUAGE_CODES.get(language, language)
-        logger.info(f"🔄 Evaluating pronunciation for reference: {reference_locator}")
+        logger.info(f"🔄 Evaluating pronunciation for reference: {reference_locator} with language code: {lang_code}")
 
         # Save the uploaded file temporarily
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
@@ -590,22 +608,31 @@ def evaluate_pronunciation():
 
         # Convert to WAV if necessary and ensure correct format
         try:
-            # Load audio with librosa for consistent processing
-            user_waveform, sr = librosa.load(user_audio_path, sr=SAMPLE_RATE, mono=True)
-
+            logger.info(f"🔄 Processing user audio file")
+            # First try using pydub for consistent processing
+            audio = AudioSegment.from_file(user_audio_path)
+            audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
+
             # Save processed audio
             processed_path = os.path.join(OUTPUT_DIR, "processed_user_audio.wav")
-            sf.write(processed_path, user_waveform, SAMPLE_RATE)
+            audio.export(processed_path, format="wav")
             logger.debug(f"📁 Processed user audio saved to {processed_path}")
-
+
+            # Load the processed audio for ASR
+            user_waveform, sr = torchaudio.load(processed_path)
+            user_waveform = user_waveform.squeeze().numpy()
+            logger.info(f"✅ User audio processed successfully: {sr}Hz, length: {len(user_waveform)} samples")
+
             # Update user_audio_path to processed file
            user_audio_path = processed_path
         except Exception as e:
             logger.error(f"❌ Audio processing failed: {str(e)}")
+            logger.debug(f"Stack trace: {traceback.format_exc()}")
             return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500
 
         # Transcribe user audio
         try:
+            logger.info(f"🔄 Transcribing user audio")
             # Process audio for ASR
             inputs = asr_processor(
                 user_waveform,
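
The pydub path introduced above normalizes any upload to 16 kHz mono WAV before ASR. The same step in isolation (file names hypothetical; AudioSegment.from_file needs ffmpeg for non-WAV input):

from pydub import AudioSegment

audio = AudioSegment.from_file("recording.m4a")      # hypothetical upload
audio = audio.set_frame_rate(16000).set_channels(1)  # 16 kHz mono, matching SAMPLE_RATE
audio.export("processed.wav", format="wav")
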
@@ -614,14 +641,14 @@ def evaluate_pronunciation():
                 language=lang_code
             )
             inputs = {k: v.to(device) for k, v in inputs.items()}
-
+
             # Perform ASR
             with torch.no_grad():
                 logits = asr_model(**inputs).logits
                 ids = torch.argmax(logits, dim=-1)[0]
                 user_transcription = asr_processor.decode(ids)
-
-            logger.info(f"✅ User transcription: {user_transcription}")
+
+            logger.info(f"✅ User transcription: '{user_transcription}'")
         except Exception as e:
             logger.error(f"❌ ASR inference failed: {str(e)}")
             return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
@@ -631,13 +658,23 @@ def evaluate_pronunciation():
         best_score = 0
         best_reference = None
         best_transcription = None
-
+
+        logger.info(f"🔄 Beginning comparison with {len(reference_files)} reference files")
+
         for ref_file in reference_files:
             try:
-                # Load reference audio
-                ref_waveform, _ = librosa.load(ref_file, sr=SAMPLE_RATE, mono=True)
-
+                logger.info(f"🔄 Processing reference file: {os.path.basename(ref_file)}")
+
+                # Load reference audio using torchaudio instead of librosa
+                ref_waveform, ref_sr = torchaudio.load(ref_file)
+                if ref_sr != SAMPLE_RATE:
+                    logger.debug(f"🔄 Resampling reference audio from {ref_sr}Hz to {SAMPLE_RATE}Hz")
+                    ref_waveform = torchaudio.transforms.Resample(ref_sr, SAMPLE_RATE)(ref_waveform)
+                ref_waveform = ref_waveform.squeeze().numpy()
+                logger.debug(f"✅ Reference audio loaded: {len(ref_waveform)} samples")
+
                 # Transcribe reference audio
+                logger.debug(f"🔄 Transcribing reference audio")
                 inputs = asr_processor(
                     ref_waveform,
                     sampling_rate=SAMPLE_RATE,
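
The load-and-resample pattern in this hunk, factored into a helper for clarity (a sketch; like the loop above, it assumes single-channel reference files):

import torchaudio

def load_mono_16k(path, target_sr=16000):
    wav, sr = torchaudio.load(path)  # (channels, samples)
    if sr != target_sr:
        wav = torchaudio.transforms.Resample(sr, target_sr)(wav)
    return wav.squeeze().numpy()     # 1-D array for the ASR processor
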
@@ -645,41 +682,44 @@ def evaluate_pronunciation():
                     language=lang_code
                 )
                 inputs = {k: v.to(device) for k, v in inputs.items()}
-
+
                 with torch.no_grad():
                     logits = asr_model(**inputs).logits
                     ids = torch.argmax(logits, dim=-1)[0]
                     ref_transcription = asr_processor.decode(ids)
-
+                logger.info(f"✅ Reference transcription: '{ref_transcription}'")
+
                 # Calculate similarity
                 similarity = calculate_similarity(user_transcription, ref_transcription)
-
+                logger.info(f"📊 Similarity with {os.path.basename(ref_file)}: {similarity:.2f}%")
+
                 results.append({
                     "reference_file": os.path.basename(ref_file),
                     "reference_text": ref_transcription,
                     "similarity_score": similarity
                 })
-
+
                 if similarity > best_score:
                     best_score = similarity
                     best_reference = os.path.basename(ref_file)
                     best_transcription = ref_transcription
-
-                logger.debug(f"📊 Reference '{os.path.basename(ref_file)}': {similarity:.2f}%")
+                    logger.info(f"📊 New best match: {best_reference} with score {best_score:.2f}%")
             except Exception as e:
                 logger.error(f"❌ Error processing reference audio {ref_file}: {str(e)}")
-
+                logger.debug(f"Stack trace: {traceback.format_exc()}")
+
         # Clean up temp files
         try:
             if os.path.exists(user_audio_path) and user_audio_path != processed_path:
                 os.unlink(user_audio_path)
+                logger.debug(f"🧹 Cleaned up temporary file: {user_audio_path}")
         except Exception as e:
             logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")
-
+
         # Enhanced feedback based on score range
         is_correct = best_score >= 70.0
         feedback = ""
-
+
         if best_score >= 90.0:
             feedback = "Perfect pronunciation! Excellent job!"
         elif best_score >= 80.0:
@@ -690,10 +730,13 @@ def evaluate_pronunciation():
             feedback = "Fair attempt. Try focusing on the syllables that differ from the sample."
         else:
             feedback = "Try again. Listen carefully to the sample pronunciation."
-
+
+        logger.info(f"📊 Final evaluation results: score={best_score:.2f}%, is_correct={is_correct}")
+        logger.info(f"📝 Feedback: '{feedback}'")
+
         # Sort results by score descending
         results.sort(key=lambda x: x["similarity_score"], reverse=True)
-
+
         return jsonify({
             "is_correct": is_correct,
             "score": best_score,
@@ -703,7 +746,7 @@ def evaluate_pronunciation():
             "reference_locator": reference_locator,
             "details": results
         })
-
+
     except Exception as e:
         logger.error(f"❌ Unhandled exception in evaluation endpoint: {str(e)}")
         logger.debug(f"Stack trace: {traceback.format_exc()}")
@@ -723,10 +766,10 @@ def upload_reference_audio():
 
         # Validate reference word
         reference_patterns = [
-            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
+            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
             "mayap_a_bengi", "komusta_ka"
         ]
-
+
         if reference_word not in reference_patterns:
             logger.warning(f"⚠️ Invalid reference word: {reference_word}")
             return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400
@@ -771,21 +814,22 @@ def upload_reference_audio():
         logger.debug(f"Stack trace: {traceback.format_exc()}")
         return jsonify({"error": f"Internal server error: {str(e)}"}), 500
 
+
 def init_reference_audio():
     try:
         # Create the output directory first
         os.makedirs(OUTPUT_DIR, exist_ok=True)
         logger.info(f"📁 Created output directory: {OUTPUT_DIR}")
-
+
         # Check if the reference audio directory exists in the repository
         if os.path.exists(REFERENCE_AUDIO_DIR):
             logger.info(f"✅ Found reference audio directory: {REFERENCE_AUDIO_DIR}")
-
+
             # Log the contents to verify
-            pattern_dirs = [d for d in os.listdir(REFERENCE_AUDIO_DIR)
-                            if os.path.isdir(os.path.join(REFERENCE_AUDIO_DIR, d))]
+            pattern_dirs = [d for d in os.listdir(REFERENCE_AUDIO_DIR)
+                            if os.path.isdir(os.path.join(REFERENCE_AUDIO_DIR, d))]
             logger.info(f"📁 Found reference patterns: {pattern_dirs}")
-
+
             # Check each pattern directory for wav files
             for pattern_dir_name in pattern_dirs:
                 pattern_path = os.path.join(REFERENCE_AUDIO_DIR, pattern_dir_name)
@@ -796,6 +840,7 @@ def init_reference_audio():
     except Exception as e:
         logger.error(f"❌ Failed to set up reference audio directory: {str(e)}")
 
+
 # Add an initialization route that will be called before the first request
 @app.before_request
 def before_request():
@@ -804,12 +849,11 @@ def before_request():
         g.initialized = True
 
 
-
 if __name__ == "__main__":
     init_reference_audio()
     logger.info("🚀 Starting Speech API server")
     logger.info(f"📊 System status: ASR model: {'✅' if asr_model else '❌'}")
     for lang, model in tts_models.items():
         logger.info(f"📊 TTS model {lang}: {'✅' if model else '❌'}")
-
+
     app.run(host="0.0.0.0", port=7860, debug=True)
 