Coco-18 committed on
Commit
c0eb848
·
verified ·
1 Parent(s): 6396296

Update app.py

Files changed (1)
  1. app.py +64 -53
app.py CHANGED
@@ -557,16 +557,18 @@ def calculate_similarity(text1, text2):
     matcher = SequenceMatcher(None, clean1, clean2)
     return matcher.ratio() * 100
 
-
 @app.route("/evaluate", methods=["POST"])
 def evaluate_pronunciation():
+    request_id = f"req-{id(request)}"  # Create unique ID for this request
+    logger.info(f"[{request_id}] 🆕 Starting new pronunciation evaluation request")
+
     if asr_model is None or asr_processor is None:
-        logger.error("❌ Evaluation endpoint called but ASR models aren't loaded")
+        logger.error(f"[{request_id}] ❌ Evaluation endpoint called but ASR models aren't loaded")
         return jsonify({"error": "ASR model not available"}), 503
 
     try:
         if "audio" not in request.files:
-            logger.warning("⚠️ Evaluation request missing audio file")
+            logger.warning(f"[{request_id}] ⚠️ Evaluation request missing audio file")
             return jsonify({"error": "No audio file uploaded"}), 400
 
         audio_file = request.files["audio"]
@@ -575,64 +577,70 @@ def evaluate_pronunciation():
 
         # Validate reference locator
         if not reference_locator:
-            logger.warning("⚠️ No reference locator provided")
+            logger.warning(f"[{request_id}] ⚠️ No reference locator provided")
             return jsonify({"error": "Reference locator is required"}), 400
 
         # Construct full reference directory path
         reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_locator)
-        logger.info(f"📁 Reference directory path: {reference_dir}")
+        logger.info(f"[{request_id}] 📁 Reference directory path: {reference_dir}")
 
         if not os.path.exists(reference_dir):
-            logger.warning(f"⚠️ Reference directory not found: {reference_dir}")
+            logger.warning(f"[{request_id}] ⚠️ Reference directory not found: {reference_dir}")
             return jsonify({"error": f"Reference audio directory not found: {reference_locator}"}), 404
 
         reference_files = glob.glob(os.path.join(reference_dir, "*.wav"))
-        logger.info(f"📁 Reference files found: {len(reference_files)}")
+        logger.info(f"[{request_id}] 📁 Found {len(reference_files)} reference files")
 
         if not reference_files:
-            logger.warning(f"⚠️ No reference audio files found in {reference_dir}")
+            logger.warning(f"[{request_id}] ⚠️ No reference audio files found in {reference_dir}")
             return jsonify({"error": f"No reference audio found for {reference_locator}"}), 404
 
-        # Log actual file paths for debugging
-        for ref_file in reference_files:
-            logger.debug(f"📁 Reference file: {ref_file}")
-
         lang_code = LANGUAGE_CODES.get(language, language)
-        logger.info(f"🔄 Evaluating pronunciation for reference: {reference_locator} with language code: {lang_code}")
+        logger.info(f"[{request_id}] 🔄 Evaluating pronunciation for reference: {reference_locator} with language code: {lang_code}")
 
+        # Create a request-specific temp directory to avoid conflicts
+        temp_dir = os.path.join(OUTPUT_DIR, f"temp_{request_id}")
+        os.makedirs(temp_dir, exist_ok=True)
+
         # Save the uploaded file temporarily
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-            temp_audio.write(audio_file.read())
-            user_audio_path = temp_audio.name
-            logger.debug(f"📁 User audio saved to {user_audio_path}")
+        user_audio_path = os.path.join(temp_dir, "user_audio_input.wav")
+        with open(user_audio_path, 'wb') as f:
+            f.write(audio_file.read())
+        logger.debug(f"[{request_id}] 📁 User audio saved to {user_audio_path}")
 
         # Convert to WAV if necessary and ensure correct format
         try:
-            logger.info(f"🔄 Processing user audio file")
+            logger.info(f"[{request_id}] 🔄 Processing user audio file")
             # First try using pydub for consistent processing
             audio = AudioSegment.from_file(user_audio_path)
             audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
 
             # Save processed audio
-            processed_path = os.path.join(OUTPUT_DIR, "processed_user_audio.wav")
+            processed_path = os.path.join(temp_dir, "processed_user_audio.wav")
             audio.export(processed_path, format="wav")
-            logger.debug(f"📁 Processed user audio saved to {processed_path}")
+            logger.debug(f"[{request_id}] 📁 Processed user audio saved to {processed_path}")
 
             # Load the processed audio for ASR
             user_waveform, sr = torchaudio.load(processed_path)
             user_waveform = user_waveform.squeeze().numpy()
-            logger.info(f"✅ User audio processed successfully: {sr}Hz, length: {len(user_waveform)} samples")
+            logger.info(f"[{request_id}] ✅ User audio processed successfully: {sr}Hz, length: {len(user_waveform)} samples")
 
             # Update user_audio_path to processed file
            user_audio_path = processed_path
         except Exception as e:
-            logger.error(f"❌ Audio processing failed: {str(e)}")
-            logger.debug(f"Stack trace: {traceback.format_exc()}")
+            logger.error(f"[{request_id}] ❌ Audio processing failed: {str(e)}")
+            logger.debug(f"[{request_id}] Stack trace: {traceback.format_exc()}")
+            # Clean up temp directory
+            try:
+                import shutil
+                shutil.rmtree(temp_dir)
+            except:
+                pass
             return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500
 
         # Transcribe user audio
         try:
-            logger.info(f"🔄 Transcribing user audio")
+            logger.info(f"[{request_id}] 🔄 Transcribing user audio")
             # Process audio for ASR
             inputs = asr_processor(
                 user_waveform,
@@ -648,9 +656,15 @@ def evaluate_pronunciation():
             ids = torch.argmax(logits, dim=-1)[0]
             user_transcription = asr_processor.decode(ids)
 
-            logger.info(f"✅ User transcription: '{user_transcription}'")
+            logger.info(f"[{request_id}] ✅ User transcription: '{user_transcription}'")
         except Exception as e:
-            logger.error(f"❌ ASR inference failed: {str(e)}")
+            logger.error(f"[{request_id}] ❌ ASR inference failed: {str(e)}")
+            # Clean up temp directory
+            try:
+                import shutil
+                shutil.rmtree(temp_dir)
+            except:
+                pass
             return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
 
         # Compare with reference audios
@@ -659,22 +673,23 @@ def evaluate_pronunciation():
         best_reference = None
         best_transcription = None
 
-        logger.info(f"🔄 Beginning comparison with {len(reference_files)} reference files")
+        logger.info(f"[{request_id}] 🔄 Beginning comparison with {len(reference_files)} reference files")
 
-        for ref_file in reference_files:
+        for ref_idx, ref_file in enumerate(reference_files):
             try:
-                logger.info(f"🔄 Processing reference file: {os.path.basename(ref_file)}")
+                ref_filename = os.path.basename(ref_file)
+                logger.info(f"[{request_id}] 🔄 [{ref_idx+1}/{len(reference_files)}] Processing reference file: {ref_filename}")
 
                 # Load reference audio using torchaudio instead of librosa
                 ref_waveform, ref_sr = torchaudio.load(ref_file)
                 if ref_sr != SAMPLE_RATE:
-                    logger.debug(f"🔄 Resampling reference audio from {ref_sr}Hz to {SAMPLE_RATE}Hz")
+                    logger.debug(f"[{request_id}] 🔄 Resampling reference audio from {ref_sr}Hz to {SAMPLE_RATE}Hz")
                     ref_waveform = torchaudio.transforms.Resample(ref_sr, SAMPLE_RATE)(ref_waveform)
                 ref_waveform = ref_waveform.squeeze().numpy()
-                logger.debug(f"✅ Reference audio loaded: {len(ref_waveform)} samples")
+                logger.debug(f"[{request_id}] ✅ Reference audio loaded: {len(ref_waveform)} samples")
 
                 # Transcribe reference audio
-                logger.debug(f"🔄 Transcribing reference audio")
+                logger.debug(f"[{request_id}] 🔄 Transcribing reference audio: {ref_filename}")
                 inputs = asr_processor(
                     ref_waveform,
                     sampling_rate=SAMPLE_RATE,
@@ -687,40 +702,40 @@ def evaluate_pronunciation():
                 logits = asr_model(**inputs).logits
                 ids = torch.argmax(logits, dim=-1)[0]
                 ref_transcription = asr_processor.decode(ids)
-                logger.info(f"✅ Reference transcription: '{ref_transcription}'")
+                logger.info(f"[{request_id}] ✅ Reference transcription for {ref_filename}: '{ref_transcription}'")
 
                 # Calculate similarity
                 similarity = calculate_similarity(user_transcription, ref_transcription)
-                logger.info(f"📊 Similarity with {os.path.basename(ref_file)}: {similarity:.2f}%")
+                logger.info(f"[{request_id}] 📊 Similarity with {ref_filename}: {similarity:.2f}%")
 
                 results.append({
-                    "reference_file": os.path.basename(ref_file),
+                    "reference_file": ref_filename,
                     "reference_text": ref_transcription,
                     "similarity_score": similarity
                 })
 
                 if similarity > best_score:
                     best_score = similarity
-                    best_reference = os.path.basename(ref_file)
+                    best_reference = ref_filename
                     best_transcription = ref_transcription
-                    logger.info(f"📊 New best match: {best_reference} with score {best_score:.2f}%")
+                    logger.info(f"[{request_id}] 📊 New best match: {best_reference} with score {best_score:.2f}%")
 
                 # Add this early exit condition here
                 if similarity > 80.0:  # If we find a really good match
-                    logger.info(f"🏁 Found excellent match (>80%). Stopping evaluation early.")
+                    logger.info(f"[{request_id}] 🏁 Found excellent match (>80%). Stopping evaluation early.")
                     break  # Exit the loop early
 
             except Exception as e:
-                logger.error(f"❌ Error processing reference audio {ref_file}: {str(e)}")
-                logger.debug(f"Stack trace: {traceback.format_exc()}")
+                logger.error(f"[{request_id}] ❌ Error processing reference audio {ref_file}: {str(e)}")
+                logger.debug(f"[{request_id}] Stack trace: {traceback.format_exc()}")
 
         # Clean up temp files
         try:
-            if os.path.exists(user_audio_path) and user_audio_path != processed_path:
-                os.unlink(user_audio_path)
-                logger.debug(f"🧹 Cleaned up temporary file: {user_audio_path}")
+            import shutil
+            shutil.rmtree(temp_dir)
+            logger.debug(f"[{request_id}] 🧹 Cleaned up temporary directory: {temp_dir}")
         except Exception as e:
-            logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")
+            logger.warning(f"[{request_id}] ⚠️ Failed to clean up temp files: {str(e)}")
 
         # Enhanced feedback based on score range
         is_correct = best_score >= 70.0
@@ -737,8 +752,9 @@ def evaluate_pronunciation():
         else:
             feedback = "Try again. Listen carefully to the sample pronunciation."
 
-        logger.info(f"📊 Final evaluation results: score={best_score:.2f}%, is_correct={is_correct}")
-        logger.info(f"📝 Feedback: '{feedback}'")
+        logger.info(f"[{request_id}] 📊 Final evaluation results: score={best_score:.2f}%, is_correct={is_correct}")
+        logger.info(f"[{request_id}] 📝 Feedback: '{feedback}'")
+        logger.info(f"[{request_id}] ✅ Evaluation complete")
 
         # Sort results by score descending
         results.sort(key=lambda x: x["similarity_score"], reverse=True)
@@ -754,8 +770,8 @@ def evaluate_pronunciation():
         })
 
     except Exception as e:
-        logger.error(f"❌ Unhandled exception in evaluation endpoint: {str(e)}")
-        logger.debug(f"Stack trace: {traceback.format_exc()}")
+        logger.error(f"[{request_id}] ❌ Unhandled exception in evaluation endpoint: {str(e)}")
+        logger.debug(f"[{request_id}] Stack trace: {traceback.format_exc()}")
         return jsonify({"error": f"Internal server error: {str(e)}"}), 500
 
 @app.route("/upload_reference", methods=["POST"])
@@ -773,11 +789,6 @@ def upload_reference_audio():
     # Validate reference word
     reference_patterns = [
         "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun", "mayap_a_bengi", "komusta_ka", "malaus_ko_pu","malaus_kayu","agaganaka_da_ka", "pagdulapan_da_ka","kaluguran_da_ka","dakal_a_salamat","panapaya_mu_ku"
-
-
-
-
-
     ]
 
     if reference_word not in reference_patterns:
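For context, a minimal sketch of how a client might call the /evaluate endpoint touched by this commit. This is illustrative only: it assumes the handler reads reference_locator and language from form fields of the same names (those reads fall outside the hunks shown), that a reference folder named after one of the greeting patterns exists, and that the Flask app is reachable on localhost:5000.

# Hypothetical client call against POST /evaluate (field values and host are assumptions, not from the diff)
import requests

with open("my_recording.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:5000/evaluate",          # assumed host/port
        files={"audio": f},                        # missing file -> 400 per the handler above
        data={
            "reference_locator": "mayap_a_abak",   # assumed form field; folder name taken from the pattern list
            "language": "kapampangan",             # assumed form field, mapped through LANGUAGE_CODES
        },
        timeout=120,
    )

print(resp.status_code)  # 200 on success; 400/404/503/500 on the error paths shown in the diff
print(resp.json())       # transcriptions and similarity scores, sorted by score descending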