Coco-18 committed on
Commit
46a80fc
·
verified ·
1 Parent(s): 69717fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import sys
4
  import logging
5
  import traceback
6
 
 
7
  # Configure logging
8
  logging.basicConfig(
9
  level=logging.INFO,
@@ -32,6 +33,10 @@ for env_var, path in cache_dirs.items():
32
 
33
  # Now import the rest of the libraries
34
  try:
 
 
 
 
35
  import torch
36
  from pydub import AudioSegment
37
  import tempfile
@@ -41,6 +46,8 @@ try:
41
  from flask_cors import CORS
42
  from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
43
  from transformers import MarianMTModel, MarianTokenizer
 
 
44
  logger.info("βœ… All required libraries imported successfully")
45
  except ImportError as e:
46
  logger.critical(f"❌ Failed to import necessary libraries: {str(e)}")
@@ -162,6 +169,9 @@ for model_key, model_id in TRANSLATION_MODELS.items():
162
  # Constants
163
  SAMPLE_RATE = 16000
164
  OUTPUT_DIR = "/tmp/audio_outputs"
 
 
 
165
  try:
166
  os.makedirs(OUTPUT_DIR, exist_ok=True)
167
  logger.info(f"πŸ“ Created output directory: {OUTPUT_DIR}")
@@ -498,6 +508,245 @@ def translate_text():
498
  logger.debug(f"Stack trace: {traceback.format_exc()}")
499
  return jsonify({"error": f"Internal server error: {str(e)}"}), 500
500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  if __name__ == "__main__":
502
  logger.info("πŸš€ Starting Speech API server")
503
  logger.info(f"πŸ“Š System status: ASR model: {'βœ…' if asr_model else '❌'}")
 
4
  import logging
5
  import traceback
6
 
7
+
8
  # Configure logging
9
  logging.basicConfig(
10
  level=logging.INFO,
 
33
 
34
  # Now import the rest of the libraries
35
  try:
36
+ import librosa
37
+ from difflib import SequenceMatcher
38
+ import glob
39
+ import numpy as np
40
  import torch
41
  from pydub import AudioSegment
42
  import tempfile
 
46
  from flask_cors import CORS
47
  from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
48
  from transformers import MarianMTModel, MarianTokenizer
49
+ from werkzeug.utils import secure_filename
50
+
51
  logger.info("βœ… All required libraries imported successfully")
52
  except ImportError as e:
53
  logger.critical(f"❌ Failed to import necessary libraries: {str(e)}")
 
169
  # Constants
170
  SAMPLE_RATE = 16000
171
  OUTPUT_DIR = "/tmp/audio_outputs"
172
+ # Root directory for reference pronunciation audio (one subdirectory per reference word)
173
+ REFERENCE_AUDIO_DIR = "/storage/reference_audio"
174
+
175
  try:
176
  os.makedirs(OUTPUT_DIR, exist_ok=True)
177
  logger.info(f"πŸ“ Created output directory: {OUTPUT_DIR}")
 
508
  logger.debug(f"Stack trace: {traceback.format_exc()}")
509
  return jsonify({"error": f"Internal server error: {str(e)}"}), 500
510
 
511
+ # Helper: text-similarity score used to compare ASR transcriptions
512
def calculate_similarity(text1, text2):
    """Return how alike two strings are, as a percentage.

    Both strings are lower-cased so the comparison is case-insensitive,
    then compared with ``difflib.SequenceMatcher``; its ratio is scaled
    by 100 before being returned.
    """
    lowered_first, lowered_second = text1.lower(), text2.lower()
    return SequenceMatcher(None, lowered_first, lowered_second).ratio() * 100
522
+
523
+ # Route: score an uploaded recording against the stored reference audio
524
def _transcribe_waveform(waveform, lang_code):
    """Run the ASR model on a waveform and return the decoded text.

    Args:
        waveform: Audio samples, as returned by ``librosa.load`` at
            ``SAMPLE_RATE``.
        lang_code: Value forwarded to ``asr_processor`` as ``language``.

    Returns:
        The text decoded from the per-frame argmax of the model logits.
    """
    inputs = asr_processor(
        waveform,
        sampling_rate=SAMPLE_RATE,
        return_tensors="pt",
        language=lang_code,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = asr_model(**inputs).logits
    ids = torch.argmax(logits, dim=-1)[0]
    return asr_processor.decode(ids)


@app.route("/evaluate", methods=["POST"])
def evaluate_pronunciation():
    """Score an uploaded recording against the stored reference recordings.

    Expects a multipart POST with an ``audio`` file, a ``reference_word``
    form field (one of the known reference patterns) and an optional
    ``language`` field (defaults to ``"tagalog"``).

    The upload and every ``*.wav`` file under
    ``REFERENCE_AUDIO_DIR/<reference_word>`` are transcribed with the ASR
    model; the transcriptions are compared with ``calculate_similarity``
    and the best score decides the verdict (correct at 70% or above).

    Returns:
        A JSON response with ``is_correct``, ``score``, ``feedback``,
        ``transcription``, ``reference_word`` and per-reference ``details``.
        Error responses use 400 (bad request), 404 (no reference audio),
        500 (processing failure) or 503 (ASR model not loaded).
    """
    if asr_model is None or asr_processor is None:
        logger.error("❌ Evaluation endpoint called but ASR models aren't loaded")
        return jsonify({"error": "ASR model not available"}), 503

    # Path of the raw upload on disk; removed in the ``finally`` below so the
    # file is cleaned up on every exit path, including early returns.
    temp_upload_path = None

    try:
        if "audio" not in request.files:
            logger.warning("⚠️ Evaluation request missing audio file")
            return jsonify({"error": "No audio file uploaded"}), 400

        audio_file = request.files["audio"]
        reference_word = request.form.get("reference_word", "").strip()
        language = request.form.get("language", "tagalog").lower()  # Default to tagalog for ASR

        # Check if reference word is valid
        reference_patterns = [
            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
            "mayap_a_bengi", "komusta_ka"
        ]

        if not reference_word or reference_word not in reference_patterns:
            logger.warning(f"⚠️ Invalid reference word: {reference_word}")
            return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400

        lang_code = LANGUAGE_CODES.get(language, language)
        logger.info(f"πŸ”„ Evaluating pronunciation of '{reference_word}' in {language}")

        # Save the uploaded file temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
            temp_upload_path = temp_audio.name
            temp_audio.write(audio_file.read())
            logger.debug(f"πŸ“ User audio saved to {temp_upload_path}")

        # Decode, resample and down-mix the upload for consistent processing
        try:
            user_waveform, _ = librosa.load(temp_upload_path, sr=SAMPLE_RATE, mono=True)

            # Save processed audio.
            # NOTE(review): this fixed filename is shared by all requests, so
            # concurrent calls overwrite each other's file. The waveform held
            # in memory is what gets transcribed, so scoring is unaffected.
            processed_path = os.path.join(OUTPUT_DIR, "processed_user_audio.wav")
            sf.write(processed_path, user_waveform, SAMPLE_RATE)
            logger.debug(f"πŸ“ Processed user audio saved to {processed_path}")
        except Exception as e:
            logger.error(f"❌ Audio processing failed: {str(e)}")
            return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500

        # Find reference audio files
        reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_word)
        if not os.path.exists(reference_dir):
            logger.warning(f"⚠️ Reference directory not found: {reference_dir}")
            return jsonify({"error": f"Reference audio for {reference_word} not found"}), 404

        reference_files = glob.glob(os.path.join(reference_dir, "*.wav"))
        if not reference_files:
            logger.warning(f"⚠️ No reference audio files found in {reference_dir}")
            return jsonify({"error": f"No reference audio found for {reference_word}"}), 404

        logger.info(f"πŸ“Š Found {len(reference_files)} reference files for '{reference_word}'")

        # Transcribe user audio
        try:
            user_transcription = _transcribe_waveform(user_waveform, lang_code)
            logger.info(f"βœ… User transcription: {user_transcription}")
        except Exception as e:
            logger.error(f"❌ ASR inference failed: {str(e)}")
            return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500

        # Compare with reference audios
        results = []
        best_score = 0
        best_reference = None

        for ref_file in reference_files:
            # A broken reference file is logged and skipped so the remaining
            # references can still be scored.
            try:
                ref_waveform, _ = librosa.load(ref_file, sr=SAMPLE_RATE, mono=True)
                ref_transcription = _transcribe_waveform(ref_waveform, lang_code)

                # Calculate similarity
                similarity = calculate_similarity(user_transcription, ref_transcription)

                results.append({
                    "reference_file": os.path.basename(ref_file),
                    "reference_text": ref_transcription,
                    "similarity_score": similarity
                })

                if similarity > best_score:
                    best_score = similarity
                    best_reference = os.path.basename(ref_file)

                logger.debug(f"πŸ“Š Reference '{os.path.basename(ref_file)}': {similarity:.2f}%")
            except Exception as e:
                logger.error(f"❌ Error processing reference audio {ref_file}: {str(e)}")

        # Decision on pronunciation correctness (70% threshold)
        is_correct = best_score >= 70.0
        feedback = "Great pronunciation!" if is_correct else "Try again! Listen to the sample"

        return jsonify({
            "is_correct": is_correct,
            "score": best_score,
            "feedback": feedback,
            "transcription": user_transcription,
            "reference_word": reference_word,
            "details": results
        })

    except Exception as e:
        logger.error(f"❌ Unhandled exception in evaluation endpoint: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500

    finally:
        # Clean up the raw upload; best-effort, a failure here must not mask
        # the response that is already being returned.
        if temp_upload_path is not None:
            try:
                if os.path.exists(temp_upload_path):
                    os.unlink(temp_upload_path)
            except OSError as e:
                logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")
674
+
675
@app.route("/upload_reference", methods=["POST"])
def upload_reference_audio():
    """Store an uploaded recording as reference audio for a known phrase.

    Expects a multipart POST with an ``audio`` file and a ``reference_word``
    form field naming one of the known reference patterns. The file is saved
    under ``REFERENCE_AUDIO_DIR/<reference_word>``; anything that is not
    already a ``.wav`` file is converted to mono WAV at ``SAMPLE_RATE``.

    Returns:
        A JSON response with ``message``, ``reference_word``, the stored
        ``file`` name and ``total_references`` (count of ``*.wav`` files for
        that word). Error responses use 400 (missing or invalid input) or
        500 (conversion or unexpected failure).
    """
    try:
        if "audio" not in request.files:
            logger.warning("⚠️ Reference upload missing audio file")
            return jsonify({"error": "No audio file uploaded"}), 400

        reference_word = request.form.get("reference_word", "").strip()
        if not reference_word:
            logger.warning("⚠️ Reference upload missing reference word")
            return jsonify({"error": "No reference word provided"}), 400

        # Validate reference word
        reference_patterns = [
            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
            "mayap_a_bengi", "komusta_ka"
        ]

        if reference_word not in reference_patterns:
            logger.warning(f"⚠️ Invalid reference word: {reference_word}")
            return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400

        audio_file = request.files["audio"]

        # secure_filename() can return "" (missing name, or a name made only
        # of unsafe characters); joining that would target the directory.
        safe_name = secure_filename(audio_file.filename or "")
        if not safe_name:
            logger.warning("⚠️ Reference upload has an empty or unsafe filename")
            return jsonify({"error": "Invalid audio filename"}), 400

        # Normalise the extension case so "*.wav" globs also find files that
        # were uploaded as e.g. "clip.WAV".
        stem, extension = os.path.splitext(safe_name)
        if extension.lower() == ".wav":
            safe_name = f"{stem}.wav"

        # Create directory for reference pattern if it doesn't exist
        pattern_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_word)
        os.makedirs(pattern_dir, exist_ok=True)

        # Save the reference audio file
        file_path = os.path.join(pattern_dir, safe_name)
        audio_file.save(file_path)

        # Convert to WAV if not already in that format
        if not file_path.endswith(".wav"):
            wav_path = f"{os.path.splitext(file_path)[0]}.wav"
            try:
                audio = AudioSegment.from_file(file_path)
                audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
                audio.export(wav_path, format="wav")
                # Remove original file if conversion successful
                os.unlink(file_path)
                file_path = wav_path
            except Exception as e:
                logger.error(f"❌ Reference audio conversion failed: {str(e)}")
                # Do not leave the unusable upload in the reference folder.
                try:
                    if os.path.exists(file_path):
                        os.unlink(file_path)
                except OSError as cleanup_error:
                    logger.warning(f"⚠️ Failed to remove unconverted upload: {str(cleanup_error)}")
                return jsonify({"error": f"Audio conversion failed: {str(e)}"}), 500

        logger.info(f"βœ… Reference audio saved successfully for {reference_word}: {file_path}")

        # Count how many references we have now
        references = glob.glob(os.path.join(pattern_dir, "*.wav"))
        return jsonify({
            "message": "Reference audio uploaded successfully",
            "reference_word": reference_word,
            "file": os.path.basename(file_path),
            "total_references": len(references)
        })

    except Exception as e:
        logger.error(f"❌ Unhandled exception in reference upload: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500
736
+
737
+ # Ensure directory exists
738
def setup_reference_audio():
    """Create the reference-audio root directory if it does not exist.

    A failure is logged instead of raised, so a missing or read-only
    storage location does not stop the application from starting.
    """
    try:
        os.makedirs(REFERENCE_AUDIO_DIR, exist_ok=True)
        logger.info(f"πŸ“ Created reference audio directory: {REFERENCE_AUDIO_DIR}")
    except OSError as e:
        logger.error(f"❌ Failed to set up reference audio directory: {str(e)}")


# Called directly at import time rather than through
# ``@app.before_first_request``: that hook was removed in Flask 2.3, where
# using the decorator raises AttributeError while the module is loading.
setup_reference_audio()
747
+
748
+
749
+
750
  if __name__ == "__main__":
751
  logger.info("πŸš€ Starting Speech API server")
752
  logger.info(f"πŸ“Š System status: ASR model: {'βœ…' if asr_model else '❌'}")