Update app.py
Browse files
app.py
CHANGED
@@ -557,16 +557,18 @@ def calculate_similarity(text1, text2):
|
|
557 |
matcher = SequenceMatcher(None, clean1, clean2)
|
558 |
return matcher.ratio() * 100
|
559 |
|
560 |
-
|
561 |
@app.route("/evaluate", methods=["POST"])
|
562 |
def evaluate_pronunciation():
|
|
|
|
|
|
|
563 |
if asr_model is None or asr_processor is None:
|
564 |
-
logger.error("❌ Evaluation endpoint called but ASR models aren't loaded")
|
565 |
return jsonify({"error": "ASR model not available"}), 503
|
566 |
|
567 |
try:
|
568 |
if "audio" not in request.files:
|
569 |
-
logger.warning("⚠️ Evaluation request missing audio file")
|
570 |
return jsonify({"error": "No audio file uploaded"}), 400
|
571 |
|
572 |
audio_file = request.files["audio"]
|
@@ -575,64 +577,70 @@ def evaluate_pronunciation():
|
|
575 |
|
576 |
# Validate reference locator
|
577 |
if not reference_locator:
|
578 |
-
logger.warning("⚠️ No reference locator provided")
|
579 |
return jsonify({"error": "Reference locator is required"}), 400
|
580 |
|
581 |
# Construct full reference directory path
|
582 |
reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_locator)
|
583 |
-
logger.info(f"π Reference directory path: {reference_dir}")
|
584 |
|
585 |
if not os.path.exists(reference_dir):
|
586 |
-
logger.warning(f"⚠️ Reference directory not found: {reference_dir}")
|
587 |
return jsonify({"error": f"Reference audio directory not found: {reference_locator}"}), 404
|
588 |
|
589 |
reference_files = glob.glob(os.path.join(reference_dir, "*.wav"))
|
590 |
-
logger.info(f"π
|
591 |
|
592 |
if not reference_files:
|
593 |
-
logger.warning(f"⚠️ No reference audio files found in {reference_dir}")
|
594 |
return jsonify({"error": f"No reference audio found for {reference_locator}"}), 404
|
595 |
|
596 |
-
# Log actual file paths for debugging
|
597 |
-
for ref_file in reference_files:
|
598 |
-
logger.debug(f"π Reference file: {ref_file}")
|
599 |
-
|
600 |
lang_code = LANGUAGE_CODES.get(language, language)
|
601 |
-
logger.info(f"π Evaluating pronunciation for reference: {reference_locator} with language code: {lang_code}")
|
602 |
|
|
|
|
|
|
|
|
|
603 |
# Save the uploaded file temporarily
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
608 |
|
609 |
# Convert to WAV if necessary and ensure correct format
|
610 |
try:
|
611 |
-
logger.info(f"π Processing user audio file")
|
612 |
# First try using pydub for consistent processing
|
613 |
audio = AudioSegment.from_file(user_audio_path)
|
614 |
audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
|
615 |
|
616 |
# Save processed audio
|
617 |
-
processed_path = os.path.join(
|
618 |
audio.export(processed_path, format="wav")
|
619 |
-
logger.debug(f"π Processed user audio saved to {processed_path}")
|
620 |
|
621 |
# Load the processed audio for ASR
|
622 |
user_waveform, sr = torchaudio.load(processed_path)
|
623 |
user_waveform = user_waveform.squeeze().numpy()
|
624 |
-
logger.info(f"✅ User audio processed successfully: {sr}Hz, length: {len(user_waveform)} samples")
|
625 |
|
626 |
# Update user_audio_path to processed file
|
627 |
user_audio_path = processed_path
|
628 |
except Exception as e:
|
629 |
-
logger.error(f"❌ Audio processing failed: {str(e)}")
|
630 |
-
logger.debug(f"Stack trace: {traceback.format_exc()}")
|
|
|
|
|
|
|
|
|
|
|
|
|
631 |
return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500
|
632 |
|
633 |
# Transcribe user audio
|
634 |
try:
|
635 |
-
logger.info(f"π Transcribing user audio")
|
636 |
# Process audio for ASR
|
637 |
inputs = asr_processor(
|
638 |
user_waveform,
|
@@ -648,9 +656,15 @@ def evaluate_pronunciation():
|
|
648 |
ids = torch.argmax(logits, dim=-1)[0]
|
649 |
user_transcription = asr_processor.decode(ids)
|
650 |
|
651 |
-
logger.info(f"✅ User transcription: '{user_transcription}'")
|
652 |
except Exception as e:
|
653 |
-
logger.error(f"❌ ASR inference failed: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
654 |
return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
|
655 |
|
656 |
# Compare with reference audios
|
@@ -659,22 +673,23 @@ def evaluate_pronunciation():
|
|
659 |
best_reference = None
|
660 |
best_transcription = None
|
661 |
|
662 |
-
logger.info(f"π Beginning comparison with {len(reference_files)} reference files")
|
663 |
|
664 |
-
for ref_file in reference_files:
|
665 |
try:
|
666 |
-
|
|
|
667 |
|
668 |
# Load reference audio using torchaudio instead of librosa
|
669 |
ref_waveform, ref_sr = torchaudio.load(ref_file)
|
670 |
if ref_sr != SAMPLE_RATE:
|
671 |
-
logger.debug(f"π Resampling reference audio from {ref_sr}Hz to {SAMPLE_RATE}Hz")
|
672 |
ref_waveform = torchaudio.transforms.Resample(ref_sr, SAMPLE_RATE)(ref_waveform)
|
673 |
ref_waveform = ref_waveform.squeeze().numpy()
|
674 |
-
logger.debug(f"✅ Reference audio loaded: {len(ref_waveform)} samples")
|
675 |
|
676 |
# Transcribe reference audio
|
677 |
-
logger.debug(f"π Transcribing reference audio")
|
678 |
inputs = asr_processor(
|
679 |
ref_waveform,
|
680 |
sampling_rate=SAMPLE_RATE,
|
@@ -687,40 +702,40 @@ def evaluate_pronunciation():
|
|
687 |
logits = asr_model(**inputs).logits
|
688 |
ids = torch.argmax(logits, dim=-1)[0]
|
689 |
ref_transcription = asr_processor.decode(ids)
|
690 |
-
logger.info(f"✅ Reference transcription: '{ref_transcription}'")
|
691 |
|
692 |
# Calculate similarity
|
693 |
similarity = calculate_similarity(user_transcription, ref_transcription)
|
694 |
-
logger.info(f"π Similarity with {
|
695 |
|
696 |
results.append({
|
697 |
-
"reference_file":
|
698 |
"reference_text": ref_transcription,
|
699 |
"similarity_score": similarity
|
700 |
})
|
701 |
|
702 |
if similarity > best_score:
|
703 |
best_score = similarity
|
704 |
-
best_reference =
|
705 |
best_transcription = ref_transcription
|
706 |
-
logger.info(f"π New best match: {best_reference} with score {best_score:.2f}%")
|
707 |
|
708 |
# Add this early exit condition here
|
709 |
if similarity > 80.0: # If we find a really good match
|
710 |
-
logger.info(f"π Found excellent match (>80%). Stopping evaluation early.")
|
711 |
break # Exit the loop early
|
712 |
|
713 |
except Exception as e:
|
714 |
-
logger.error(f"❌ Error processing reference audio {ref_file}: {str(e)}")
|
715 |
-
logger.debug(f"Stack trace: {traceback.format_exc()}")
|
716 |
|
717 |
# Clean up temp files
|
718 |
try:
|
719 |
-
|
720 |
-
|
721 |
-
|
722 |
except Exception as e:
|
723 |
-
logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")
|
724 |
|
725 |
# Enhanced feedback based on score range
|
726 |
is_correct = best_score >= 70.0
|
@@ -737,8 +752,9 @@ def evaluate_pronunciation():
|
|
737 |
else:
|
738 |
feedback = "Try again. Listen carefully to the sample pronunciation."
|
739 |
|
740 |
-
logger.info(f"π Final evaluation results: score={best_score:.2f}%, is_correct={is_correct}")
|
741 |
-
logger.info(f"π Feedback: '{feedback}'")
|
|
|
742 |
|
743 |
# Sort results by score descending
|
744 |
results.sort(key=lambda x: x["similarity_score"], reverse=True)
|
@@ -754,8 +770,8 @@ def evaluate_pronunciation():
|
|
754 |
})
|
755 |
|
756 |
except Exception as e:
|
757 |
-
logger.error(f"❌ Unhandled exception in evaluation endpoint: {str(e)}")
|
758 |
-
logger.debug(f"Stack trace: {traceback.format_exc()}")
|
759 |
return jsonify({"error": f"Internal server error: {str(e)}"}), 500
|
760 |
|
761 |
@app.route("/upload_reference", methods=["POST"])
|
@@ -773,11 +789,6 @@ def upload_reference_audio():
|
|
773 |
# Validate reference word
|
774 |
reference_patterns = [
|
775 |
"mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun", "mayap_a_bengi", "komusta_ka", "malaus_ko_pu","malaus_kayu","agaganaka_da_ka", "pagdulapan_da_ka","kaluguran_da_ka","dakal_a_salamat","panapaya_mu_ku"
|
776 |
-
|
777 |
-
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
]
|
782 |
|
783 |
if reference_word not in reference_patterns:
|
|
|
557 |
matcher = SequenceMatcher(None, clean1, clean2)
|
558 |
return matcher.ratio() * 100
|
559 |
|
|
|
560 |
@app.route("/evaluate", methods=["POST"])
|
561 |
def evaluate_pronunciation():
|
562 |
+
request_id = f"req-{id(request)}" # Create unique ID for this request
|
563 |
+
logger.info(f"[{request_id}] π Starting new pronunciation evaluation request")
|
564 |
+
|
565 |
if asr_model is None or asr_processor is None:
|
566 |
+
logger.error(f"[{request_id}] ❌ Evaluation endpoint called but ASR models aren't loaded")
|
567 |
return jsonify({"error": "ASR model not available"}), 503
|
568 |
|
569 |
try:
|
570 |
if "audio" not in request.files:
|
571 |
+
logger.warning(f"[{request_id}] ⚠️ Evaluation request missing audio file")
|
572 |
return jsonify({"error": "No audio file uploaded"}), 400
|
573 |
|
574 |
audio_file = request.files["audio"]
|
|
|
577 |
|
578 |
# Validate reference locator
|
579 |
if not reference_locator:
|
580 |
+
logger.warning(f"[{request_id}] ⚠️ No reference locator provided")
|
581 |
return jsonify({"error": "Reference locator is required"}), 400
|
582 |
|
583 |
# Construct full reference directory path
|
584 |
reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_locator)
|
585 |
+
logger.info(f"[{request_id}] π Reference directory path: {reference_dir}")
|
586 |
|
587 |
if not os.path.exists(reference_dir):
|
588 |
+
logger.warning(f"[{request_id}] ⚠️ Reference directory not found: {reference_dir}")
|
589 |
return jsonify({"error": f"Reference audio directory not found: {reference_locator}"}), 404
|
590 |
|
591 |
reference_files = glob.glob(os.path.join(reference_dir, "*.wav"))
|
592 |
+
logger.info(f"[{request_id}] π Found {len(reference_files)} reference files")
|
593 |
|
594 |
if not reference_files:
|
595 |
+
logger.warning(f"[{request_id}] ⚠️ No reference audio files found in {reference_dir}")
|
596 |
return jsonify({"error": f"No reference audio found for {reference_locator}"}), 404
|
597 |
|
|
|
|
|
|
|
|
|
598 |
lang_code = LANGUAGE_CODES.get(language, language)
|
599 |
+
logger.info(f"[{request_id}] π Evaluating pronunciation for reference: {reference_locator} with language code: {lang_code}")
|
600 |
|
601 |
+
# Create a request-specific temp directory to avoid conflicts
|
602 |
+
temp_dir = os.path.join(OUTPUT_DIR, f"temp_{request_id}")
|
603 |
+
os.makedirs(temp_dir, exist_ok=True)
|
604 |
+
|
605 |
# Save the uploaded file temporarily
|
606 |
+
user_audio_path = os.path.join(temp_dir, "user_audio_input.wav")
|
607 |
+
with open(user_audio_path, 'wb') as f:
|
608 |
+
f.write(audio_file.read())
|
609 |
+
logger.debug(f"[{request_id}] π User audio saved to {user_audio_path}")
|
610 |
|
611 |
# Convert to WAV if necessary and ensure correct format
|
612 |
try:
|
613 |
+
logger.info(f"[{request_id}] π Processing user audio file")
|
614 |
# First try using pydub for consistent processing
|
615 |
audio = AudioSegment.from_file(user_audio_path)
|
616 |
audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
|
617 |
|
618 |
# Save processed audio
|
619 |
+
processed_path = os.path.join(temp_dir, "processed_user_audio.wav")
|
620 |
audio.export(processed_path, format="wav")
|
621 |
+
logger.debug(f"[{request_id}] π Processed user audio saved to {processed_path}")
|
622 |
|
623 |
# Load the processed audio for ASR
|
624 |
user_waveform, sr = torchaudio.load(processed_path)
|
625 |
user_waveform = user_waveform.squeeze().numpy()
|
626 |
+
logger.info(f"[{request_id}] ✅ User audio processed successfully: {sr}Hz, length: {len(user_waveform)} samples")
|
627 |
|
628 |
# Update user_audio_path to processed file
|
629 |
user_audio_path = processed_path
|
630 |
except Exception as e:
|
631 |
+
logger.error(f"[{request_id}] ❌ Audio processing failed: {str(e)}")
|
632 |
+
logger.debug(f"[{request_id}] Stack trace: {traceback.format_exc()}")
|
633 |
+
# Clean up temp directory
|
634 |
+
try:
|
635 |
+
import shutil
|
636 |
+
shutil.rmtree(temp_dir)
|
637 |
+
except:
|
638 |
+
pass
|
639 |
return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500
|
640 |
|
641 |
# Transcribe user audio
|
642 |
try:
|
643 |
+
logger.info(f"[{request_id}] π Transcribing user audio")
|
644 |
# Process audio for ASR
|
645 |
inputs = asr_processor(
|
646 |
user_waveform,
|
|
|
656 |
ids = torch.argmax(logits, dim=-1)[0]
|
657 |
user_transcription = asr_processor.decode(ids)
|
658 |
|
659 |
+
logger.info(f"[{request_id}] ✅ User transcription: '{user_transcription}'")
|
660 |
except Exception as e:
|
661 |
+
logger.error(f"[{request_id}] ❌ ASR inference failed: {str(e)}")
|
662 |
+
# Clean up temp directory
|
663 |
+
try:
|
664 |
+
import shutil
|
665 |
+
shutil.rmtree(temp_dir)
|
666 |
+
except:
|
667 |
+
pass
|
668 |
return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
|
669 |
|
670 |
# Compare with reference audios
|
|
|
673 |
best_reference = None
|
674 |
best_transcription = None
|
675 |
|
676 |
+
logger.info(f"[{request_id}] π Beginning comparison with {len(reference_files)} reference files")
|
677 |
|
678 |
+
for ref_idx, ref_file in enumerate(reference_files):
|
679 |
try:
|
680 |
+
ref_filename = os.path.basename(ref_file)
|
681 |
+
logger.info(f"[{request_id}] π [{ref_idx+1}/{len(reference_files)}] Processing reference file: {ref_filename}")
|
682 |
|
683 |
# Load reference audio using torchaudio instead of librosa
|
684 |
ref_waveform, ref_sr = torchaudio.load(ref_file)
|
685 |
if ref_sr != SAMPLE_RATE:
|
686 |
+
logger.debug(f"[{request_id}] π Resampling reference audio from {ref_sr}Hz to {SAMPLE_RATE}Hz")
|
687 |
ref_waveform = torchaudio.transforms.Resample(ref_sr, SAMPLE_RATE)(ref_waveform)
|
688 |
ref_waveform = ref_waveform.squeeze().numpy()
|
689 |
+
logger.debug(f"[{request_id}] ✅ Reference audio loaded: {len(ref_waveform)} samples")
|
690 |
|
691 |
# Transcribe reference audio
|
692 |
+
logger.debug(f"[{request_id}] π Transcribing reference audio: {ref_filename}")
|
693 |
inputs = asr_processor(
|
694 |
ref_waveform,
|
695 |
sampling_rate=SAMPLE_RATE,
|
|
|
702 |
logits = asr_model(**inputs).logits
|
703 |
ids = torch.argmax(logits, dim=-1)[0]
|
704 |
ref_transcription = asr_processor.decode(ids)
|
705 |
+
logger.info(f"[{request_id}] ✅ Reference transcription for {ref_filename}: '{ref_transcription}'")
|
706 |
|
707 |
# Calculate similarity
|
708 |
similarity = calculate_similarity(user_transcription, ref_transcription)
|
709 |
+
logger.info(f"[{request_id}] π Similarity with {ref_filename}: {similarity:.2f}%")
|
710 |
|
711 |
results.append({
|
712 |
+
"reference_file": ref_filename,
|
713 |
"reference_text": ref_transcription,
|
714 |
"similarity_score": similarity
|
715 |
})
|
716 |
|
717 |
if similarity > best_score:
|
718 |
best_score = similarity
|
719 |
+
best_reference = ref_filename
|
720 |
best_transcription = ref_transcription
|
721 |
+
logger.info(f"[{request_id}] π New best match: {best_reference} with score {best_score:.2f}%")
|
722 |
|
723 |
# Add this early exit condition here
|
724 |
if similarity > 80.0: # If we find a really good match
|
725 |
+
logger.info(f"[{request_id}] π Found excellent match (>80%). Stopping evaluation early.")
|
726 |
break # Exit the loop early
|
727 |
|
728 |
except Exception as e:
|
729 |
+
logger.error(f"[{request_id}] ❌ Error processing reference audio {ref_file}: {str(e)}")
|
730 |
+
logger.debug(f"[{request_id}] Stack trace: {traceback.format_exc()}")
|
731 |
|
732 |
# Clean up temp files
|
733 |
try:
|
734 |
+
import shutil
|
735 |
+
shutil.rmtree(temp_dir)
|
736 |
+
logger.debug(f"[{request_id}] 🧹 Cleaned up temporary directory: {temp_dir}")
|
737 |
except Exception as e:
|
738 |
+
logger.warning(f"[{request_id}] ⚠️ Failed to clean up temp files: {str(e)}")
|
739 |
|
740 |
# Enhanced feedback based on score range
|
741 |
is_correct = best_score >= 70.0
|
|
|
752 |
else:
|
753 |
feedback = "Try again. Listen carefully to the sample pronunciation."
|
754 |
|
755 |
+
logger.info(f"[{request_id}] π Final evaluation results: score={best_score:.2f}%, is_correct={is_correct}")
|
756 |
+
logger.info(f"[{request_id}] π Feedback: '{feedback}'")
|
757 |
+
logger.info(f"[{request_id}] ✅ Evaluation complete")
|
758 |
|
759 |
# Sort results by score descending
|
760 |
results.sort(key=lambda x: x["similarity_score"], reverse=True)
|
|
|
770 |
})
|
771 |
|
772 |
except Exception as e:
|
773 |
+
logger.error(f"[{request_id}] ❌ Unhandled exception in evaluation endpoint: {str(e)}")
|
774 |
+
logger.debug(f"[{request_id}] Stack trace: {traceback.format_exc()}")
|
775 |
return jsonify({"error": f"Internal server error: {str(e)}"}), 500
|
776 |
|
777 |
@app.route("/upload_reference", methods=["POST"])
|
|
|
789 |
# Validate reference word
|
790 |
reference_patterns = [
|
791 |
"mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun", "mayap_a_bengi", "komusta_ka", "malaus_ko_pu","malaus_kayu","agaganaka_da_ka", "pagdulapan_da_ka","kaluguran_da_ka","dakal_a_salamat","panapaya_mu_ku"
|
|
|
|
|
|
|
|
|
|
|
792 |
]
|
793 |
|
794 |
if reference_word not in reference_patterns:
|