Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import sys
|
|
4 |
import logging
|
5 |
import traceback
|
6 |
|
|
|
7 |
# Configure logging
|
8 |
logging.basicConfig(
|
9 |
level=logging.INFO,
|
@@ -32,6 +33,10 @@ for env_var, path in cache_dirs.items():
|
|
32 |
|
33 |
# Now import the rest of the libraries
|
34 |
try:
|
|
|
|
|
|
|
|
|
35 |
import torch
|
36 |
from pydub import AudioSegment
|
37 |
import tempfile
|
@@ -41,6 +46,8 @@ try:
|
|
41 |
from flask_cors import CORS
|
42 |
from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
|
43 |
from transformers import MarianMTModel, MarianTokenizer
|
|
|
|
|
44 |
logger.info("β
All required libraries imported successfully")
|
45 |
except ImportError as e:
|
46 |
logger.critical(f"β Failed to import necessary libraries: {str(e)}")
|
@@ -162,6 +169,9 @@ for model_key, model_id in TRANSLATION_MODELS.items():
|
|
162 |
# Constants
|
163 |
SAMPLE_RATE = 16000
|
164 |
OUTPUT_DIR = "/tmp/audio_outputs"
|
|
|
|
|
|
|
165 |
try:
|
166 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
167 |
logger.info(f"π Created output directory: {OUTPUT_DIR}")
|
@@ -498,6 +508,245 @@ def translate_text():
|
|
498 |
logger.debug(f"Stack trace: {traceback.format_exc()}")
|
499 |
return jsonify({"error": f"Internal server error: {str(e)}"}), 500
|
500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
501 |
if __name__ == "__main__":
|
502 |
logger.info("π Starting Speech API server")
|
503 |
logger.info(f"π System status: ASR model: {'β
' if asr_model else 'β'}")
|
|
|
4 |
import logging
|
5 |
import traceback
|
6 |
|
7 |
+
|
8 |
# Configure logging
|
9 |
logging.basicConfig(
|
10 |
level=logging.INFO,
|
|
|
33 |
|
34 |
# Now import the rest of the libraries
|
35 |
try:
|
36 |
+
import librosa
|
37 |
+
from difflib import SequenceMatcher
|
38 |
+
import glob
|
39 |
+
import numpy as np
|
40 |
import torch
|
41 |
from pydub import AudioSegment
|
42 |
import tempfile
|
|
|
46 |
from flask_cors import CORS
|
47 |
from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
|
48 |
from transformers import MarianMTModel, MarianTokenizer
|
49 |
+
from werkzeug.utils import secure_filename
|
50 |
+
|
51 |
logger.info("β
All required libraries imported successfully")
|
52 |
except ImportError as e:
|
53 |
logger.critical(f"β Failed to import necessary libraries: {str(e)}")
|
|
|
169 |
# Constants
|
170 |
SAMPLE_RATE = 16000
|
171 |
OUTPUT_DIR = "/tmp/audio_outputs"
|
172 |
+
# Root directory for reference pronunciation recordings (one sub-directory per reference word)
|
173 |
+
REFERENCE_AUDIO_DIR = "/storage/reference_audio"
|
174 |
+
|
175 |
try:
|
176 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
177 |
logger.info(f"π Created output directory: {OUTPUT_DIR}")
|
|
|
508 |
logger.debug(f"Stack trace: {traceback.format_exc()}")
|
509 |
return jsonify({"error": f"Internal server error: {str(e)}"}), 500
|
510 |
|
511 |
+
# Text-similarity helper used by the /evaluate endpoint
|
512 |
+
def calculate_similarity(text1, text2):
    """Calculate text similarity percentage.

    Both inputs are lower-cased and stripped of surrounding whitespace, then
    compared with ``difflib.SequenceMatcher``.

    Args:
        text1: First string (``None`` is treated as an empty string).
        text2: Second string (``None`` is treated as an empty string).

    Returns:
        A float between 0.0 and 100.0. Returns 0.0 when either text is empty
        after cleaning.
    """
    def clean_text(text):
        # A missing transcription is treated like an empty one instead of raising.
        return (text or "").strip().lower()

    clean1 = clean_text(text1)
    clean2 = clean_text(text2)

    # SequenceMatcher.ratio() is 1.0 for two empty sequences, which would score
    # "nothing vs. nothing" as a perfect match; report no similarity instead.
    if not clean1 or not clean2:
        return 0.0

    matcher = SequenceMatcher(None, clean1, clean2)
    return matcher.ratio() * 100
|
522 |
+
|
523 |
+
# Pronunciation evaluation endpoint
|
524 |
+
def _transcribe_waveform(waveform, lang_code):
    """Run the ASR model on one waveform and return the decoded text.

    Args:
        waveform: Audio samples as loaded by ``librosa.load`` at ``SAMPLE_RATE``.
        lang_code: Language code forwarded to the ASR processor.

    Returns:
        The transcription produced by ``asr_processor.decode``.
    """
    inputs = asr_processor(
        waveform,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        language=lang_code
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = asr_model(**inputs).logits
    # Pick the highest-scoring token id per step for the first item in the batch.
    ids = torch.argmax(logits, dim=-1)[0]
    return asr_processor.decode(ids)


@app.route("/evaluate", methods=["POST"])
def evaluate_pronunciation():
    """Score an uploaded recording against the reference recordings of a word.

    Expects a multipart POST with:
        audio: the user's recording (file field).
        reference_word: one of the supported reference patterns.
        language: optional, defaults to "tagalog"; mapped through LANGUAGE_CODES.

    The user's recording and every ``*.wav`` file under
    ``REFERENCE_AUDIO_DIR/<reference_word>`` are transcribed with the ASR model,
    and the transcriptions are compared with ``calculate_similarity``. The best
    score decides correctness (70% threshold).

    Returns:
        A JSON response with ``is_correct``, ``score``, ``feedback``,
        ``transcription``, ``reference_word``, ``best_reference`` and
        ``details``; or a JSON ``error`` with status 400, 404, 500 or 503.
    """
    if asr_model is None or asr_processor is None:
        logger.error("β Evaluation endpoint called but ASR models aren't loaded")
        return jsonify({"error": "ASR model not available"}), 503

    # Path of the temporary copy of the upload; removed in the finally block so
    # that every exit path (including early returns) cleans it up.
    upload_path = None

    try:
        if "audio" not in request.files:
            logger.warning("β οΈ Evaluation request missing audio file")
            return jsonify({"error": "No audio file uploaded"}), 400

        audio_file = request.files["audio"]
        reference_word = request.form.get("reference_word", "").strip()
        language = request.form.get("language", "tagalog").lower()  # Default to tagalog for ASR

        # Check if reference word is valid
        reference_patterns = [
            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
            "mayap_a_bengi", "komusta_ka"
        ]

        if not reference_word or reference_word not in reference_patterns:
            logger.warning(f"β οΈ Invalid reference word: {reference_word}")
            return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400

        lang_code = LANGUAGE_CODES.get(language, language)
        logger.info(f"π Evaluating pronunciation of '{reference_word}' in {language}")

        # Save the uploaded file temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            upload_path = temp_audio.name
            temp_audio.write(audio_file.read())
        logger.debug(f"π User audio saved to {upload_path}")

        # Decode/resample the upload to mono at SAMPLE_RATE
        try:
            # Load audio with librosa for consistent processing
            user_waveform, _ = librosa.load(upload_path, sr=SAMPLE_RATE, mono=True)

            # Save processed audio. The file is not read again in this function.
            # NOTE(review): the fixed file name is shared by concurrent requests,
            # and `sf` (presumably soundfile) must be imported elsewhere in this
            # file - confirm both.
            processed_path = os.path.join(OUTPUT_DIR, "processed_user_audio.wav")
            sf.write(processed_path, user_waveform, SAMPLE_RATE)
            logger.debug(f"π Processed user audio saved to {processed_path}")
        except Exception as e:
            logger.error(f"β Audio processing failed: {str(e)}")
            return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500

        # Find reference audio files
        reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_word)
        if not os.path.exists(reference_dir):
            logger.warning(f"β οΈ Reference directory not found: {reference_dir}")
            return jsonify({"error": f"Reference audio for {reference_word} not found"}), 404

        reference_files = glob.glob(os.path.join(reference_dir, "*.wav"))
        if not reference_files:
            logger.warning(f"β οΈ No reference audio files found in {reference_dir}")
            return jsonify({"error": f"No reference audio found for {reference_word}"}), 404

        logger.info(f"π Found {len(reference_files)} reference files for '{reference_word}'")

        # Transcribe user audio
        try:
            user_transcription = _transcribe_waveform(user_waveform, lang_code)
            logger.info(f"β User transcription: {user_transcription}")
        except Exception as e:
            logger.error(f"β ASR inference failed: {str(e)}")
            return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500

        # Compare with reference audios
        results = []
        best_score = 0
        best_reference = None

        for ref_file in reference_files:
            ref_name = os.path.basename(ref_file)
            try:
                # Load and transcribe reference audio
                ref_waveform, _ = librosa.load(ref_file, sr=SAMPLE_RATE, mono=True)
                ref_transcription = _transcribe_waveform(ref_waveform, lang_code)

                # Calculate similarity
                similarity = calculate_similarity(user_transcription, ref_transcription)

                results.append({
                    "reference_file": ref_name,
                    "reference_text": ref_transcription,
                    "similarity_score": similarity
                })

                if similarity > best_score:
                    best_score = similarity
                    best_reference = ref_name

                logger.debug(f"π Reference '{os.path.basename(ref_file)}': {similarity:.2f}%")
            except Exception as e:
                # A single bad reference file must not abort the whole evaluation.
                logger.error(f"β Error processing reference audio {ref_file}: {str(e)}")

        # If every reference failed, the score would be a misleading 0 ("Try
        # again") although the problem is on the server side.
        if not results:
            logger.error(f"No reference audio could be processed for '{reference_word}'")
            return jsonify({"error": f"Could not process reference audio for {reference_word}"}), 500

        # Decision on pronunciation correctness (70% threshold)
        is_correct = best_score >= 70.0
        feedback = "Great pronunciation!" if is_correct else "Try again! Listen to the sample"

        return jsonify({
            "is_correct": is_correct,
            "score": best_score,
            "feedback": feedback,
            "transcription": user_transcription,
            "reference_word": reference_word,
            "best_reference": best_reference,
            "details": results
        })

    except Exception as e:
        logger.error(f"β Unhandled exception in evaluation endpoint: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500

    finally:
        # Clean up the temporary upload; the processed file in OUTPUT_DIR is kept.
        if upload_path is not None:
            try:
                os.unlink(upload_path)
            except FileNotFoundError:
                pass
            except OSError as e:
                logger.warning(f"β οΈ Failed to clean up temp files: {str(e)}")
|
674 |
+
|
675 |
+
@app.route("/upload_reference", methods=["POST"])
def upload_reference_audio():
    """Store an uploaded recording as reference audio for a supported word.

    Expects a multipart POST with:
        audio: the reference recording (file field).
        reference_word: one of the supported reference patterns.

    The file is saved under ``REFERENCE_AUDIO_DIR/<reference_word>``. Uploads
    that are not WAV are converted to mono WAV at ``SAMPLE_RATE`` and the
    original upload is removed.

    Returns:
        A JSON response with ``message``, ``reference_word``, ``file`` and
        ``total_references``; or a JSON ``error`` with status 400 or 500.
    """
    try:
        if "audio" not in request.files:
            logger.warning("β οΈ Reference upload missing audio file")
            return jsonify({"error": "No audio file uploaded"}), 400

        reference_word = request.form.get("reference_word", "").strip()
        if not reference_word:
            logger.warning("β οΈ Reference upload missing reference word")
            return jsonify({"error": "No reference word provided"}), 400

        # Validate reference word
        reference_patterns = [
            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
            "mayap_a_bengi", "komusta_ka"
        ]

        if reference_word not in reference_patterns:
            logger.warning(f"β οΈ Invalid reference word: {reference_word}")
            return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400

        audio_file = request.files["audio"]

        # secure_filename() can return an empty string (missing name, or a name
        # made only of unsafe characters); joining that would target the
        # directory itself, so reject it as a client error.
        safe_name = secure_filename(audio_file.filename or "")
        if not safe_name:
            logger.warning("Reference upload has no usable filename")
            return jsonify({"error": "Invalid or missing audio filename"}), 400

        # Reference files are looked up with the case-sensitive pattern "*.wav",
        # so normalise extensions such as ".WAV" to lower case.
        stem, ext = os.path.splitext(safe_name)
        if ext.lower() == ".wav":
            safe_name = f"{stem}.wav"

        # Create directory for reference pattern if it doesn't exist
        pattern_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_word)
        os.makedirs(pattern_dir, exist_ok=True)

        # Save the reference audio file
        file_path = os.path.join(pattern_dir, safe_name)
        audio_file.save(file_path)

        # Convert to WAV if not already in that format
        if not file_path.lower().endswith('.wav'):
            uploaded_path = file_path
            wav_path = f"{os.path.splitext(uploaded_path)[0]}.wav"
            try:
                audio = AudioSegment.from_file(uploaded_path)
                audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
                audio.export(wav_path, format="wav")
                file_path = wav_path
            except Exception as e:
                logger.error(f"β Reference audio conversion failed: {str(e)}")
                return jsonify({"error": f"Audio conversion failed: {str(e)}"}), 500
            finally:
                # Remove the non-WAV upload whether or not conversion succeeded,
                # so failed uploads do not pile up in the reference directory.
                try:
                    os.unlink(uploaded_path)
                except OSError as e:
                    logger.warning(f"Could not remove uploaded file {uploaded_path}: {str(e)}")

        logger.info(f"β Reference audio saved successfully for {reference_word}: {file_path}")

        # Count how many references we have now
        references = glob.glob(os.path.join(pattern_dir, "*.wav"))
        return jsonify({
            "message": "Reference audio uploaded successfully",
            "reference_word": reference_word,
            "file": os.path.basename(file_path),
            "total_references": len(references)
        })

    except Exception as e:
        logger.error(f"β Unhandled exception in reference upload: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500
|
736 |
+
|
737 |
+
# Ensure directory exists
|
738 |
+
def setup_reference_audio():
    """Create the reference audio root directory if it does not exist.

    Failures are logged and not raised, so a storage problem does not stop the
    app from starting.
    """
    try:
        os.makedirs(REFERENCE_AUDIO_DIR, exist_ok=True)
        logger.info(f"π Created reference audio directory: {REFERENCE_AUDIO_DIR}")
    except Exception as e:
        logger.error(f"β Failed to set up reference audio directory: {str(e)}")


# `Flask.before_first_request` was deprecated in Flask 2.2 and removed in 2.3,
# where using it as a decorator raises AttributeError at import time. Register
# the hook when it exists; otherwise run the setup once at import.
if hasattr(app, "before_first_request"):
    app.before_first_request(setup_reference_audio)
else:
    setup_reference_audio()
|
747 |
+
|
748 |
+
|
749 |
+
|
750 |
if __name__ == "__main__":
|
751 |
logger.info("π Starting Speech API server")
|
752 |
logger.info(f"π System status: ASR model: {'β
' if asr_model else 'β'}")
|