Update app.py
app.py CHANGED
@@ -4,7 +4,6 @@ import sys
 import logging
 import traceback
 
-
 # Configure logging
 logging.basicConfig(
     level=logging.INFO,
@@ -47,7 +46,7 @@ try:
     from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
     from transformers import MarianMTModel, MarianTokenizer
     from werkzeug.utils import secure_filename
-
+
     logger.info("✅ All required libraries imported successfully")
 except ImportError as e:
     logger.critical(f"❌ Failed to import necessary libraries: {str(e)}")
@@ -77,7 +76,7 @@ try:
         cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
     )
     logger.info("✅ ASR processor loaded successfully")
-
+
     asr_model = Wav2Vec2ForCTC.from_pretrained(
         ASR_MODEL_ID,
         cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
@@ -113,13 +112,13 @@ for lang, model_id in TTS_MODELS.items():
     logger.info(f"π Loading TTS model for {lang}: {model_id}")
     try:
         tts_processors[lang] = AutoTokenizer.from_pretrained(
-            model_id,
+            model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
         )
         logger.info(f"✅ {lang} TTS processor loaded")
-
+
         tts_models[lang] = VitsModel.from_pretrained(
-            model_id,
+            model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
         )
         tts_models[lang].to(device)
@@ -135,7 +134,7 @@ TRANSLATION_MODELS = {
     "eng-pam": "Coco-18/opus-mt-en-pam",
     "tgl-eng": "Helsinki-NLP/opus-mt-tl-en",
     "eng-tgl": "Helsinki-NLP/opus-mt-en-tl",
-    "phi": "Coco-18/opus-mt-phi"
+    "phi": "Coco-18/opus-mt-phi"
 }
 
 logger.info(f"π Loading Translation model: {TRANSLATION_MODELS}")
@@ -146,14 +145,14 @@ translation_tokenizers = {}
 
 for model_key, model_id in TRANSLATION_MODELS.items():
     logger.info(f"π Loading Translation model: {model_id}")
-
+
     try:
         translation_tokenizers[model_key] = MarianTokenizer.from_pretrained(
             model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
         )
         logger.info(f"✅ Translation tokenizer loaded successfully for {model_key}")
-
+
         translation_models[model_key] = MarianMTModel.from_pretrained(
             model_id,
             cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
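Note: all of the loaders above follow the same from_pretrained(model_id, cache_dir=...) plus .to(device) recipe. A stripped-down, standalone sketch of that recipe, using one of the checkpoints actually listed in TRANSLATION_MODELS (cache_dir omitted since it only controls where weights are stored):

# Minimal standalone version of the loading recipe used throughout this section.
import torch
from transformers import MarianMTModel, MarianTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tl-en")
model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-tl-en").to(device)
model.eval()  # inference only, matching the torch.no_grad() usage later in this file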
@@ -169,7 +168,7 @@ for model_key, model_id in TRANSLATION_MODELS.items():
# Constants
SAMPLE_RATE = 16000
OUTPUT_DIR = "/tmp/audio_outputs"
-REFERENCE_AUDIO_DIR = "./reference_audios"
+REFERENCE_AUDIO_DIR = "./reference_audios"

try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -177,43 +176,47 @@ try:
except Exception as e:
    logger.error(f"❌ Failed to create output directory: {str(e)}")

+
@app.route("/", methods=["GET"])
def home():
    return jsonify({"message": "Speech API is running", "status": "active"})

+
@app.route("/health", methods=["GET"])
def health_check():
    # Initialize direct language pair statuses based on loaded models
    translation_status = {}
-
+
    # Add status for direct model pairs
    for lang_pair in ["pam-eng", "eng-pam", "tgl-eng", "eng-tgl"]:
-        translation_status[lang_pair] = "loaded" if lang_pair in translation_models and translation_models[
-            lang_pair] is not None else "failed"
+        translation_status[lang_pair] = "loaded" if lang_pair in translation_models and translation_models[
+            lang_pair] is not None else "failed"
+
    # Add special phi model status
    phi_status = "loaded" if "phi" in translation_models and translation_models["phi"] is not None else "failed"
    translation_status["pam-fil"] = phi_status
    translation_status["fil-pam"] = phi_status
    translation_status["pam-tgl"] = phi_status  # Using phi model but replacing tgl with fil
    translation_status["tgl-pam"] = phi_status  # Using phi model but replacing tgl with fil
-
+
    health_status = {
        "api_status": "online",
        "asr_model": "loaded" if asr_model is not None else "failed",
-        "tts_models": {lang: "loaded" if model is not None else "failed"
-                       for lang, model in tts_models.items()},
+        "tts_models": {lang: "loaded" if model is not None else "failed"
+                       for lang, model in tts_models.items()},
        "translation_models": translation_status,
        "device": device
    }
    return jsonify(health_status)

+
@app.route("/check_references", methods=["GET"])
def check_references():
    """Endpoint to check if reference files exist and are accessible"""
-    ref_patterns = ["mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
+    ref_patterns = ["mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
                    "mayap_a_bengi", "komusta_ka"]
    results = {}
-
+
    for pattern in ref_patterns:
        pattern_dir = os.path.join(REFERENCE_AUDIO_DIR, pattern)
        if os.path.exists(pattern_dir):
@@ -229,19 +232,20 @@ def check_references():
            "exists": False,
            "path": pattern_dir
        }
-
+
    return jsonify({
        "reference_audio_dir": REFERENCE_AUDIO_DIR,
        "directory_exists": os.path.exists(REFERENCE_AUDIO_DIR),
        "patterns": results
    })

+
@app.route("/asr", methods=["POST"])
def transcribe_audio():
    if asr_model is None or asr_processor is None:
        logger.error("❌ ASR endpoint called but models aren't loaded")
        return jsonify({"error": "ASR model not available"}), 503
-
+
    try:
        if "audio" not in request.files:
            logger.warning("⚠️ ASR request missing audio file")
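For quick manual checks, the two GET endpoints above can be hit directly. A small client sketch, assuming the server is running locally on the port used at the bottom of this file (7860):

# Hedged usage sketch: query the status endpoints of a locally running instance.
import requests

BASE = "http://localhost:7860"  # assumed local deployment; adjust to your Space URL

print(requests.get(f"{BASE}/health").json())            # model/device status
print(requests.get(f"{BASE}/check_references").json())  # reference audio availability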
@@ -252,7 +256,8 @@ def transcribe_audio():

        if language not in LANGUAGE_CODES:
            logger.warning(f"⚠️ Unsupported language requested: {language}")
-            return jsonify(
+            return jsonify(
+                {"error": f"Unsupported language: {language}. Available: {list(LANGUAGE_CODES.keys())}"}), 400

        lang_code = LANGUAGE_CODES[language]
        logger.info(f"π Processing {language} audio for ASR")
@@ -310,9 +315,9 @@ def transcribe_audio():
            logits = asr_model(**inputs).logits
            ids = torch.argmax(logits, dim=-1)[0]
            transcription = asr_processor.decode(ids)
-
+
        logger.info(f"✅ Transcription ({language}): {transcription}")
-
+
        # Clean up temp files
        try:
            os.unlink(temp_audio_path)
@@ -320,7 +325,7 @@ def transcribe_audio():
            os.unlink(wav_path)
        except Exception as e:
            logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")
-
+
        return jsonify({
            "transcription": transcription,
            "language": language,
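The decode step in this handler is plain greedy CTC decoding. A sketch of the same three lines in isolation, using a one-second silent clip as a stand-in for an uploaded file; asr_processor, asr_model and device stand for the globals loaded earlier in app.py, and return_tensors="pt" is assumed (the full processor call sits outside the changed lines):

# Greedy CTC decoding, mirroring the /asr handler above.
import numpy as np
import torch

dummy_audio = np.zeros(16000, dtype=np.float32)  # SAMPLE_RATE samples of silence

inputs = asr_processor(dummy_audio, sampling_rate=16000, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    logits = asr_model(**inputs).logits      # (batch, time, vocab)
ids = torch.argmax(logits, dim=-1)[0]        # best token id per frame
print(asr_processor.decode(ids))             # collapses repeats/blanks into text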
@@ -344,24 +349,24 @@ def generate_tts():
        if not data:
            logger.warning("⚠️ TTS endpoint called with no JSON data")
            return jsonify({"error": "No JSON data provided"}), 400
-
+
        text_input = data.get("text", "").strip()
        language = data.get("language", "kapampangan").lower()

        if not text_input:
            logger.warning("⚠️ TTS request with empty text")
            return jsonify({"error": "No text provided"}), 400
-
+
        if language not in TTS_MODELS:
            logger.warning(f"⚠️ TTS requested for unsupported language: {language}")
            return jsonify({"error": f"Invalid language. Available options: {list(TTS_MODELS.keys())}"}), 400
-
+
        if tts_models[language] is None:
            logger.error(f"❌ TTS model for {language} not loaded")
            return jsonify({"error": f"TTS model for {language} not available"}), 503

        logger.info(f"π Generating TTS for language: {language}, text: '{text_input}'")
-
+
        try:
            processor = tts_processors[language]
            model = tts_models[language]
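The synthesis body itself is outside this diff, but the models are loaded as VitsModel + AutoTokenizer above, so it presumably follows the standard transformers VITS pattern. A minimal sketch of that pattern; facebook/mms-tts-eng is only an illustrative public checkpoint, not necessarily one of the entries in TTS_MODELS:

# Typical VITS synthesis with transformers (sketch, not the code in this commit).
import torch
import torchaudio
from transformers import VitsModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
model = VitsModel.from_pretrained("facebook/mms-tts-eng")

inputs = tokenizer("Mayap a abak", return_tensors="pt")
with torch.no_grad():
    waveform = model(**inputs).waveform[0]   # 1-D float tensor of samples

torchaudio.save("tts_sample.wav", waveform.unsqueeze(0).cpu(), model.config.sampling_rate)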
@@ -409,10 +414,11 @@ def download_audio(filename):
    if os.path.exists(file_path):
        logger.info(f"π€ Serving audio file: {file_path}")
        return send_file(file_path, mimetype="audio/wav", as_attachment=True)
-
+
    logger.warning(f"⚠️ Requested file not found: {file_path}")
    return jsonify({"error": "File not found"}), 404

+
@app.route("/translate", methods=["POST"])
def translate_text():
    try:
@@ -420,7 +426,7 @@ def translate_text():
        if not data:
            logger.warning("⚠️ Translation endpoint called with no JSON data")
            return jsonify({"error": "No JSON data provided"}), 400
-
+
        source_text = data.get("text", "").strip()
        source_language = data.get("source_language", "").lower()
        target_language = data.get("target_language", "").lower()
@@ -428,18 +434,18 @@ def translate_text():
        if not source_text:
            logger.warning("⚠️ Translation request with empty text")
            return jsonify({"error": "No text provided"}), 400
-
+
        # Map language names to codes
        source_code = LANGUAGE_CODES.get(source_language, source_language)
        target_code = LANGUAGE_CODES.get(target_language, target_language)
-
+
        logger.info(f"π Translating from {source_language} to {target_language}: '{source_text}'")
-
+
        # Special handling for pam-fil, fil-pam, pam-tgl and tgl-pam using the phi model
        use_phi_model = False
        actual_source_code = source_code
        actual_target_code = target_code
-
+
        # Check if we need to use the phi model with fil replacement
        if (source_code == "pam" and target_code == "fil") or (source_code == "fil" and target_code == "pam"):
            use_phi_model = True
@@ -449,38 +455,38 @@ def translate_text():
        elif (source_code == "tgl" and target_code == "pam"):
            use_phi_model = True
            actual_source_code = "fil"  # Replace tgl with fil for the phi model
-
+
        if use_phi_model:
            model_key = "phi"
-
+
            # Check if we have the phi model
            if model_key not in translation_models or translation_models[model_key] is None:
                logger.error(f"❌ Translation model for {model_key} not loaded")
                return jsonify({"error": f"Translation model not available"}), 503
-
+
            try:
                # Get the phi model and tokenizer
                model = translation_models[model_key]
                tokenizer = translation_tokenizers[model_key]
-
+
                # Prepend target language token to input
                input_text = f">>{actual_target_code}<< {source_text}"
-
+
                logger.info(f"π Using phi model with input: '{input_text}'")
-
+
                # Tokenize the text
                tokenized = tokenizer(input_text, return_tensors="pt", padding=True)
                tokenized = {k: v.to(device) for k, v in tokenized.items()}
-
+
                # Generate translation
                with torch.no_grad():
                    translated = model.generate(**tokenized)
-
+
                # Decode the translation
                result = tokenizer.decode(translated[0], skip_special_tokens=True)
-
+
                logger.info(f"✅ Translation result: '{result}'")
-
+
                return jsonify({
                    "translated_text": result,
                    "source_language": source_language,
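The >>{actual_target_code}<< prefix is the standard Marian convention for multi-target checkpoints: the tokenizer maps it to a target-language token that steers generation. A minimal sketch of the same call pattern; the Coco-18/opus-mt-phi ID and the fil/pam replacement are taken from this file, and whether other >>code<< tokens work depends on how that checkpoint was trained:

# Sketch of the phi-model branch: one multilingual Marian checkpoint,
# with the target language selected via a >>code<< prefix token.
import torch
from transformers import MarianMTModel, MarianTokenizer

model_id = "Coco-18/opus-mt-phi"                      # from TRANSLATION_MODELS above
tokenizer = MarianTokenizer.from_pretrained(model_id)
model = MarianMTModel.from_pretrained(model_id)

text = ">>fil<< Komusta ka?"                          # target code + source sentence
batch = tokenizer(text, return_tensors="pt", padding=True)
with torch.no_grad():
    out = model.generate(**batch)
print(tokenizer.decode(out[0], skip_special_tokens=True))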
@@ -493,34 +499,35 @@ def translate_text():
        else:
            # Create the regular language pair key for other language pairs
            lang_pair = f"{source_code}-{target_code}"
-
+
            # Check if we have a model for this language pair
            if lang_pair not in translation_models:
                logger.warning(f"⚠️ No translation model available for {lang_pair}")
-                return jsonify(
-                    {"error": f"Translation from {source_language} to {target_language} is not supported yet"}), 400
+                return jsonify(
+                    {"error": f"Translation from {source_language} to {target_language} is not supported yet"}), 400
+
            if translation_models[lang_pair] is None or translation_tokenizers[lang_pair] is None:
                logger.error(f"❌ Translation model for {lang_pair} not loaded")
                return jsonify({"error": f"Translation model not available"}), 503
-
+
            try:
                # Regular translation process for other language pairs
                model = translation_models[lang_pair]
                tokenizer = translation_tokenizers[lang_pair]
-
+
                # Tokenize the text
                tokenized = tokenizer(source_text, return_tensors="pt", padding=True)
                tokenized = {k: v.to(device) for k, v in tokenized.items()}
-
+
                # Generate translation
                with torch.no_grad():
                    translated = model.generate(**tokenized)
-
+
                # Decode the translation
                result = tokenizer.decode(translated[0], skip_special_tokens=True)
-
+
                logger.info(f"✅ Translation result: '{result}'")
-
+
                return jsonify({
                    "translated_text": result,
                    "source_language": source_language,
@@ -530,30 +537,33 @@ def translate_text():
                logger.error(f"❌ Translation processing failed: {str(e)}")
                logger.debug(f"Stack trace: {traceback.format_exc()}")
                return jsonify({"error": f"Translation processing failed: {str(e)}"}), 500
-
+
    except Exception as e:
        logger.error(f"❌ Unhandled exception in translation endpoint: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500

+
# Add this function to your app.py
def calculate_similarity(text1, text2):
    """Calculate text similarity percentage."""
+
    def clean_text(text):
        return text.lower()
-
+
    clean1 = clean_text(text1)
    clean2 = clean_text(text2)
-
+
    matcher = SequenceMatcher(None, clean1, clean2)
    return matcher.ratio() * 100

+
@app.route("/evaluate", methods=["POST"])
def evaluate_pronunciation():
    if asr_model is None or asr_processor is None:
        logger.error("❌ Evaluation endpoint called but ASR models aren't loaded")
        return jsonify({"error": "ASR model not available"}), 503
-
+
    try:
        if "audio" not in request.files:
            logger.warning("⚠️ Evaluation request missing audio file")
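calculate_similarity() is difflib ratio matching on lowercased strings, scaled to a percentage. A quick standalone check of how it behaves (the second score is only approximate, hence printed rather than asserted):

# Standalone check of the similarity measure used by /evaluate.
from difflib import SequenceMatcher

def calculate_similarity(text1, text2):
    clean1, clean2 = text1.lower(), text2.lower()
    return SequenceMatcher(None, clean1, clean2).ratio() * 100

print(calculate_similarity("mayap a abak", "mayap a abak"))   # identical strings -> 100.0
print(calculate_similarity("mayap a abak", "mayap a bengi"))  # partial match, well below 100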
@@ -570,17 +580,25 @@ def evaluate_pronunciation():

        # Construct full reference directory path
        reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_locator)
+        logger.info(f"π Reference directory path: {reference_dir}")
+
        if not os.path.exists(reference_dir):
            logger.warning(f"⚠️ Reference directory not found: {reference_dir}")
            return jsonify({"error": f"Reference audio directory not found: {reference_locator}"}), 404

        reference_files = glob.glob(os.path.join(reference_dir, "*.wav"))
+        logger.info(f"π Reference files found: {len(reference_files)}")
+
        if not reference_files:
            logger.warning(f"⚠️ No reference audio files found in {reference_dir}")
            return jsonify({"error": f"No reference audio found for {reference_locator}"}), 404

+        # Log actual file paths for debugging
+        for ref_file in reference_files:
+            logger.debug(f"π Reference file: {ref_file}")
+
        lang_code = LANGUAGE_CODES.get(language, language)
-        logger.info(f"π Evaluating pronunciation for reference: {reference_locator}")
+        logger.info(f"π Evaluating pronunciation for reference: {reference_locator} with language code: {lang_code}")

        # Save the uploaded file temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
@@ -590,22 +608,31 @@ def evaluate_pronunciation():

        # Convert to WAV if necessary and ensure correct format
        try:
-
-
-
+            logger.info(f"π Processing user audio file")
+            # First try using pydub for consistent processing
+            audio = AudioSegment.from_file(user_audio_path)
+            audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
+
            # Save processed audio
            processed_path = os.path.join(OUTPUT_DIR, "processed_user_audio.wav")
-
+            audio.export(processed_path, format="wav")
            logger.debug(f"π Processed user audio saved to {processed_path}")
-
+
+            # Load the processed audio for ASR
+            user_waveform, sr = torchaudio.load(processed_path)
+            user_waveform = user_waveform.squeeze().numpy()
+            logger.info(f"✅ User audio processed successfully: {sr}Hz, length: {len(user_waveform)} samples")
+
            # Update user_audio_path to processed file
            user_audio_path = processed_path
        except Exception as e:
            logger.error(f"❌ Audio processing failed: {str(e)}")
+            logger.debug(f"Stack trace: {traceback.format_exc()}")
            return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500

        # Transcribe user audio
        try:
+            logger.info(f"π Transcribing user audio")
            # Process audio for ASR
            inputs = asr_processor(
                user_waveform,
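The new preprocessing path (pydub to normalize the upload to 16 kHz mono WAV, then torchaudio to get a numpy waveform) can be exercised as a small helper. A sketch under the same assumptions as the handler, i.e. ffmpeg available for pydub's decoding and SAMPLE_RATE = 16000:

# Sketch of the pydub + torchaudio preprocessing used in /evaluate.
import torchaudio
from pydub import AudioSegment

SAMPLE_RATE = 16000

def load_as_mono_16k(src_path, dst_path):
    """Normalize any input file to 16 kHz mono WAV and return it as a 1-D numpy array."""
    audio = AudioSegment.from_file(src_path)
    audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
    audio.export(dst_path, format="wav")

    waveform, sr = torchaudio.load(dst_path)   # (channels, samples) float tensor
    return waveform.squeeze().numpy(), sr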
@@ -614,14 +641,14 @@ def evaluate_pronunciation():
                language=lang_code
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
-
+
            # Perform ASR
            with torch.no_grad():
                logits = asr_model(**inputs).logits
                ids = torch.argmax(logits, dim=-1)[0]
                user_transcription = asr_processor.decode(ids)
-
-            logger.info(f"✅ User transcription: {user_transcription}")
+
+            logger.info(f"✅ User transcription: '{user_transcription}'")
        except Exception as e:
            logger.error(f"❌ ASR inference failed: {str(e)}")
            return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
@@ -631,13 +658,23 @@ def evaluate_pronunciation():
        best_score = 0
        best_reference = None
        best_transcription = None
-
+
+        logger.info(f"π Beginning comparison with {len(reference_files)} reference files")
+
        for ref_file in reference_files:
            try:
-
-
-
+                logger.info(f"π Processing reference file: {os.path.basename(ref_file)}")
+
+                # Load reference audio using torchaudio instead of librosa
+                ref_waveform, ref_sr = torchaudio.load(ref_file)
+                if ref_sr != SAMPLE_RATE:
+                    logger.debug(f"π Resampling reference audio from {ref_sr}Hz to {SAMPLE_RATE}Hz")
+                    ref_waveform = torchaudio.transforms.Resample(ref_sr, SAMPLE_RATE)(ref_waveform)
+                ref_waveform = ref_waveform.squeeze().numpy()
+                logger.debug(f"✅ Reference audio loaded: {len(ref_waveform)} samples")
+
                # Transcribe reference audio
+                logger.debug(f"π Transcribing reference audio")
                inputs = asr_processor(
                    ref_waveform,
                    sampling_rate=SAMPLE_RATE,
@@ -645,41 +682,44 @@ def evaluate_pronunciation():
                    language=lang_code
                )
                inputs = {k: v.to(device) for k, v in inputs.items()}
-
+
                with torch.no_grad():
                    logits = asr_model(**inputs).logits
                    ids = torch.argmax(logits, dim=-1)[0]
                    ref_transcription = asr_processor.decode(ids)
-
+                logger.info(f"✅ Reference transcription: '{ref_transcription}'")
+
                # Calculate similarity
                similarity = calculate_similarity(user_transcription, ref_transcription)
-
+                logger.info(f"π Similarity with {os.path.basename(ref_file)}: {similarity:.2f}%")
+
                results.append({
                    "reference_file": os.path.basename(ref_file),
                    "reference_text": ref_transcription,
                    "similarity_score": similarity
                })
-
+
                if similarity > best_score:
                    best_score = similarity
                    best_reference = os.path.basename(ref_file)
                    best_transcription = ref_transcription
-
-                logger.debug(f"π Reference '{os.path.basename(ref_file)}': {similarity:.2f}%")
+                    logger.info(f"π New best match: {best_reference} with score {best_score:.2f}%")
            except Exception as e:
                logger.error(f"❌ Error processing reference audio {ref_file}: {str(e)}")
-
+                logger.debug(f"Stack trace: {traceback.format_exc()}")
+
        # Clean up temp files
        try:
            if os.path.exists(user_audio_path) and user_audio_path != processed_path:
                os.unlink(user_audio_path)
+                logger.debug(f"🧹 Cleaned up temporary file: {user_audio_path}")
        except Exception as e:
            logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")
-
+
        # Enhanced feedback based on score range
        is_correct = best_score >= 70.0
        feedback = ""
-
+
        if best_score >= 90.0:
            feedback = "Perfect pronunciation! Excellent job!"
        elif best_score >= 80.0:
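A side observation on this commit: the processor -> logits -> argmax -> decode block now appears three times in the file (the /asr handler, the user audio above, and the reference loop). If it keeps growing, it could be pulled into one helper along these lines (a refactoring sketch, not part of the commit; it uses the asr_processor/asr_model/device globals and SAMPLE_RATE from this file, and the return_tensors argument is assumed since it sits outside the changed lines):

# Possible shared helper for the repeated ASR block in this file.
import torch

def transcribe(waveform, lang_code):
    inputs = asr_processor(waveform, sampling_rate=SAMPLE_RATE,
                           return_tensors="pt", language=lang_code)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = asr_model(**inputs).logits
    ids = torch.argmax(logits, dim=-1)[0]
    return asr_processor.decode(ids)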
@@ -690,10 +730,13 @@ def evaluate_pronunciation():
            feedback = "Fair attempt. Try focusing on the syllables that differ from the sample."
        else:
            feedback = "Try again. Listen carefully to the sample pronunciation."
-
+
+        logger.info(f"π Final evaluation results: score={best_score:.2f}%, is_correct={is_correct}")
+        logger.info(f"π Feedback: '{feedback}'")
+
        # Sort results by score descending
        results.sort(key=lambda x: x["similarity_score"], reverse=True)
-
+
        return jsonify({
            "is_correct": is_correct,
            "score": best_score,
@@ -703,7 +746,7 @@ def evaluate_pronunciation():
            "reference_locator": reference_locator,
            "details": results
        })
-
+
    except Exception as e:
        logger.error(f"❌ Unhandled exception in evaluation endpoint: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
@@ -723,10 +766,10 @@ def upload_reference_audio():

        # Validate reference word
        reference_patterns = [
-            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
+            "mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun",
            "mayap_a_bengi", "komusta_ka"
        ]
-
+
        if reference_word not in reference_patterns:
            logger.warning(f"⚠️ Invalid reference word: {reference_word}")
            return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400
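A client-side sketch of calling /evaluate. The route and the "audio" file field are visible in the hunks above; the reference_locator and language form-field names are inferred from the handler's variable names and are assumptions, as is the local base URL:

# Hedged usage sketch for the /evaluate endpoint of a locally running instance.
import requests

BASE = "http://localhost:7860"  # assumed local deployment

with open("my_recording.wav", "rb") as f:
    resp = requests.post(
        f"{BASE}/evaluate",
        files={"audio": f},
        data={"reference_locator": "mayap_a_abak", "language": "kapampangan"},
    )
print(resp.json())  # keys seen in the handler include is_correct, score, reference_locator, details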
@@ -771,21 +814,22 @@ def upload_reference_audio():
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500

+
def init_reference_audio():
    try:
        # Create the output directory first
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        logger.info(f"π Created output directory: {OUTPUT_DIR}")
-
+
        # Check if the reference audio directory exists in the repository
        if os.path.exists(REFERENCE_AUDIO_DIR):
            logger.info(f"✅ Found reference audio directory: {REFERENCE_AUDIO_DIR}")
-
+
            # Log the contents to verify
-            pattern_dirs = [d for d in os.listdir(REFERENCE_AUDIO_DIR)
-                            if os.path.isdir(os.path.join(REFERENCE_AUDIO_DIR, d))]
+            pattern_dirs = [d for d in os.listdir(REFERENCE_AUDIO_DIR)
+                            if os.path.isdir(os.path.join(REFERENCE_AUDIO_DIR, d))]
            logger.info(f"π Found reference patterns: {pattern_dirs}")
-
+
            # Check each pattern directory for wav files
            for pattern_dir_name in pattern_dirs:
                pattern_path = os.path.join(REFERENCE_AUDIO_DIR, pattern_dir_name)
@@ -796,6 +840,7 @@ def init_reference_audio():
    except Exception as e:
        logger.error(f"❌ Failed to set up reference audio directory: {str(e)}")

+
# Add an initialization route that will be called before the first request
@app.before_request
def before_request():
@@ -804,12 +849,11 @@ def before_request():
        g.initialized = True


-
if __name__ == "__main__":
    init_reference_audio()
    logger.info("π Starting Speech API server")
    logger.info(f"π System status: ASR model: {'✅' if asr_model else '❌'}")
    for lang, model in tts_models.items():
        logger.info(f"π TTS model {lang}: {'✅' if model else '❌'}")
-
+
    app.run(host="0.0.0.0", port=7860, debug=True)
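One note on the before_request hook: the visible g.initialized fragment is the usual run-once-lazily pattern (useful now that newer Flask versions have removed before_first_request), but flask.g is reset on every request, so a module-level flag is what actually keeps the init from re-running. A hedged variant sketch, since most of the hook's body is outside this diff:

# Run-once initialization sketch (a variant, not the code in this commit).
_initialized = False

@app.before_request
def _ensure_initialized():
    global _initialized
    if not _initialized:
        init_reference_audio()
        _initialized = True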