|
|
|
import os
import sys
import logging
import traceback
|
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("speech_api")
|
# Point every Hugging Face / Torch cache at a writable /tmp location (useful on
# containers whose home directory is read-only).
cache_dirs = {
    "HF_HOME": "/tmp/hf_home",
    "TRANSFORMERS_CACHE": "/tmp/transformers_cache",
    "HUGGINGFACE_HUB_CACHE": "/tmp/huggingface_hub_cache",
    "TORCH_HOME": "/tmp/torch_home",
    "XDG_CACHE_HOME": "/tmp/xdg_cache"
}
|
for env_var, path in cache_dirs.items():
    os.environ[env_var] = path
    try:
        os.makedirs(path, exist_ok=True)
        logger.info(f"Created cache directory: {path}")
    except Exception as e:
        logger.error(f"Failed to create directory {path}: {str(e)}")
|
|
|
|
|
try:
    import librosa
    from difflib import SequenceMatcher
    import glob
    import numpy as np
    import torch
    from pydub import AudioSegment
    import tempfile
    import torchaudio
    import soundfile as sf
    from flask import Flask, request, jsonify, send_file
    from flask_cors import CORS
    from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
    from transformers import MarianMTModel, MarianTokenizer
    from werkzeug.utils import secure_filename

    logger.info("All required libraries imported successfully")
except ImportError as e:
    logger.critical(f"Failed to import necessary libraries: {str(e)}")
    sys.exit(1)
|
if torch.cuda.is_available():
    logger.info(f"CUDA available: {torch.cuda.get_device_name(0)}")
    device = "cuda"
else:
    logger.info("CUDA not available, using CPU")
    device = "cpu"

app = Flask(__name__)
CORS(app)
|
ASR_MODEL_ID = "Coco-18/mms-asr-tgl-en-safetensor"
logger.info(f"Loading ASR model: {ASR_MODEL_ID}")

asr_processor = None
asr_model = None

try:
    asr_processor = AutoProcessor.from_pretrained(
        ASR_MODEL_ID,
        cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
    )
    logger.info("ASR processor loaded successfully")

    asr_model = Wav2Vec2ForCTC.from_pretrained(
        ASR_MODEL_ID,
        cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
    )
    asr_model.to(device)
    logger.info(f"ASR model loaded successfully on {device}")
except Exception as e:
    logger.error(f"Error loading ASR model: {str(e)}")
    logger.debug(f"Stack trace: {traceback.format_exc()}")
    logger.debug(f"Python version: {sys.version}")
    logger.debug(f"Current working directory: {os.getcwd()}")
    logger.debug(f"Temp directory exists: {os.path.exists('/tmp')}")
    logger.debug(f"Temp directory writeable: {os.access('/tmp', os.W_OK)}")
|
# Map human-readable language names to ISO 639-3 codes.
LANGUAGE_CODES = {
    "kapampangan": "pam",
    "filipino": "fil",
    "english": "eng",
    "tagalog": "tgl",
}
|
TTS_MODELS = {
    "kapampangan": "facebook/mms-tts-pam",
    "tagalog": "facebook/mms-tts-tgl",
    "english": "facebook/mms-tts-eng"
}

tts_models = {}
tts_processors = {}
for lang, model_id in TTS_MODELS.items():
    logger.info(f"Loading TTS model for {lang}: {model_id}")
    try:
        tts_processors[lang] = AutoTokenizer.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        logger.info(f"{lang} TTS processor loaded")

        tts_models[lang] = VitsModel.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        tts_models[lang].to(device)
        logger.info(f"{lang} TTS model loaded on {device}")
    except Exception as e:
        logger.error(f"Failed to load {lang} TTS model: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        tts_models[lang] = None
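# A language whose model failed to load stays None, so /tts can return 503 for
# that language while the successfully loaded ones keep serving requests.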
|
|
|
|
|
TRANSLATION_MODELS = {
    "pam-eng": "Coco-18/opus-mt-pam-en",
    "eng-pam": "Coco-18/opus-mt-en-pam",
    "tgl-eng": "Helsinki-NLP/opus-mt-tl-en",
    "eng-tgl": "Helsinki-NLP/opus-mt-en-tl",
    "phi": "Coco-18/opus-mt-phi"
}

logger.info(f"Loading {len(TRANSLATION_MODELS)} translation models")

translation_models = {}
translation_tokenizers = {}

for model_key, model_id in TRANSLATION_MODELS.items():
    logger.info(f"Loading translation model: {model_id}")
    try:
        translation_tokenizers[model_key] = MarianTokenizer.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        logger.info(f"Translation tokenizer loaded successfully for {model_key}")

        translation_models[model_key] = MarianMTModel.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        translation_models[model_key].to(device)
        logger.info(f"Translation model loaded successfully on {device} for {model_key}")
    except Exception as e:
        logger.error(f"Error loading translation model for {model_key}: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        translation_models[model_key] = None
        translation_tokenizers[model_key] = None
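# The single multilingual "phi" checkpoint covers every Kapampangan <->
# Filipino/Tagalog pair; /translate below rewrites "tgl" to "fil" where needed
# and prepends a >>lang<< target-language token before generation.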
|
|
|
|
|
SAMPLE_RATE = 16000
OUTPUT_DIR = "/tmp/audio_outputs"
REFERENCE_AUDIO_DIR = "/storage/reference_audio"

try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    logger.info(f"Created output directory: {OUTPUT_DIR}")
except Exception as e:
    logger.error(f"Failed to create output directory: {str(e)}")
|
@app.route("/", methods=["GET"]) |
|
def home(): |
|
return jsonify({"message": "Speech API is running", "status": "active"}) |
|
|
|
@app.route("/health", methods=["GET"]) |
|
def health_check(): |
|
|
|
translation_status = {} |
|
|
|
|
|
for lang_pair in ["pam-eng", "eng-pam", "tgl-eng", "eng-tgl"]: |
|
translation_status[lang_pair] = "loaded" if lang_pair in translation_models and translation_models[lang_pair] is not None else "failed" |
|
|
|
|
|
phi_status = "loaded" if "phi" in translation_models and translation_models["phi"] is not None else "failed" |
|
translation_status["pam-fil"] = phi_status |
|
translation_status["fil-pam"] = phi_status |
|
translation_status["pam-tgl"] = phi_status |
|
translation_status["tgl-pam"] = phi_status |
|
|
|
health_status = { |
|
"api_status": "online", |
|
"asr_model": "loaded" if asr_model is not None else "failed", |
|
"tts_models": {lang: "loaded" if model is not None else "failed" |
|
for lang, model in tts_models.items()}, |
|
"translation_models": translation_status, |
|
"device": device |
|
} |
|
return jsonify(health_status) |
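# A healthy instance responds with JSON shaped roughly like:
# {"api_status": "online", "asr_model": "loaded",
#  "tts_models": {"kapampangan": "loaded", ...},
#  "translation_models": {"pam-eng": "loaded", ...}, "device": "cuda"}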
|
|
|
@app.route("/asr", methods=["POST"]) |
|
def transcribe_audio(): |
|
if asr_model is None or asr_processor is None: |
|
logger.error("β ASR endpoint called but models aren't loaded") |
|
return jsonify({"error": "ASR model not available"}), 503 |
|
|
|
try: |
|
if "audio" not in request.files: |
|
logger.warning("β οΈ ASR request missing audio file") |
|
return jsonify({"error": "No audio file uploaded"}), 400 |
|
|
|
audio_file = request.files["audio"] |
|
language = request.form.get("language", "english").lower() |
|
|
|
if language not in LANGUAGE_CODES: |
|
logger.warning(f"β οΈ Unsupported language requested: {language}") |
|
return jsonify({"error": f"Unsupported language: {language}. Available: {list(LANGUAGE_CODES.keys())}"}), 400 |
|
|
|
lang_code = LANGUAGE_CODES[language] |
|
logger.info(f"π Processing {language} audio for ASR") |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio_file.filename)[-1]) as temp_audio: |
|
temp_audio.write(audio_file.read()) |
|
temp_audio_path = temp_audio.name |
|
logger.debug(f"π Temporary audio saved to {temp_audio_path}") |
|
|
|
|
|
wav_path = temp_audio_path |
|
if not audio_file.filename.lower().endswith(".wav"): |
|
wav_path = os.path.join(OUTPUT_DIR, "converted_audio.wav") |
|
logger.info(f"π Converting audio to WAV format: {wav_path}") |
|
try: |
|
audio = AudioSegment.from_file(temp_audio_path) |
|
audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1) |
|
audio.export(wav_path, format="wav") |
|
except Exception as e: |
|
logger.error(f"β Audio conversion failed: {str(e)}") |
|
return jsonify({"error": f"Audio conversion failed: {str(e)}"}), 500 |
|
|
|
|
|
try: |
|
waveform, sr = torchaudio.load(wav_path) |
|
logger.debug(f"β
Audio loaded: {wav_path} (Sample rate: {sr}Hz)") |
|
|
|
|
|
if sr != SAMPLE_RATE: |
|
logger.info(f"π Resampling audio from {sr}Hz to {SAMPLE_RATE}Hz") |
|
waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform) |
|
|
|
waveform = waveform / torch.max(torch.abs(waveform)) |
|
except Exception as e: |
|
logger.error(f"β Failed to load or process audio: {str(e)}") |
|
return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500 |
|
|
|
|
|
try: |
|
inputs = asr_processor( |
|
waveform.squeeze().numpy(), |
|
sampling_rate=SAMPLE_RATE, |
|
return_tensors="pt", |
|
language=lang_code |
|
) |
|
inputs = {k: v.to(device) for k, v in inputs.items()} |
|
except Exception as e: |
|
logger.error(f"β ASR preprocessing failed: {str(e)}") |
|
return jsonify({"error": f"ASR preprocessing failed: {str(e)}"}), 500 |
|
|
|
|
|
try: |
|
with torch.no_grad(): |
|
logits = asr_model(**inputs).logits |
|
ids = torch.argmax(logits, dim=-1)[0] |
|
transcription = asr_processor.decode(ids) |
|
|
|
logger.info(f"β
Transcription ({language}): {transcription}") |
|
|
|
|
|
try: |
|
os.unlink(temp_audio_path) |
|
if wav_path != temp_audio_path: |
|
os.unlink(wav_path) |
|
except Exception as e: |
|
logger.warning(f"β οΈ Failed to clean up temp files: {str(e)}") |
|
|
|
return jsonify({ |
|
"transcription": transcription, |
|
"language": language, |
|
"language_code": lang_code |
|
}) |
|
except Exception as e: |
|
logger.error(f"β ASR inference failed: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500 |
|
|
|
except Exception as e: |
|
logger.error(f"β Unhandled exception in ASR endpoint: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"Internal server error: {str(e)}"}), 500 |
|
|
|
|
|
@app.route("/tts", methods=["POST"]) |
|
def generate_tts(): |
|
try: |
|
data = request.get_json() |
|
if not data: |
|
logger.warning("β οΈ TTS endpoint called with no JSON data") |
|
return jsonify({"error": "No JSON data provided"}), 400 |
|
|
|
text_input = data.get("text", "").strip() |
|
language = data.get("language", "kapampangan").lower() |
|
|
|
if not text_input: |
|
logger.warning("β οΈ TTS request with empty text") |
|
return jsonify({"error": "No text provided"}), 400 |
|
|
|
if language not in TTS_MODELS: |
|
logger.warning(f"β οΈ TTS requested for unsupported language: {language}") |
|
return jsonify({"error": f"Invalid language. Available options: {list(TTS_MODELS.keys())}"}), 400 |
|
|
|
if tts_models[language] is None: |
|
logger.error(f"β TTS model for {language} not loaded") |
|
return jsonify({"error": f"TTS model for {language} not available"}), 503 |
|
|
|
logger.info(f"π Generating TTS for language: {language}, text: '{text_input}'") |
|
|
|
try: |
|
processor = tts_processors[language] |
|
model = tts_models[language] |
|
inputs = processor(text_input, return_tensors="pt") |
|
inputs = {k: v.to(device) for k, v in inputs.items()} |
|
except Exception as e: |
|
logger.error(f"β TTS preprocessing failed: {str(e)}") |
|
return jsonify({"error": f"TTS preprocessing failed: {str(e)}"}), 500 |
|
|
|
|
|
try: |
|
with torch.no_grad(): |
|
output = model(**inputs).waveform |
|
waveform = output.squeeze().cpu().numpy() |
|
except Exception as e: |
|
logger.error(f"β TTS inference failed: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"TTS inference failed: {str(e)}"}), 500 |
|
|
|
|
|
try: |
|
output_filename = os.path.join(OUTPUT_DIR, f"{language}_output.wav") |
|
sampling_rate = model.config.sampling_rate |
|
sf.write(output_filename, waveform, sampling_rate) |
|
logger.info(f"β
Speech generated! File saved: {output_filename}") |
|
except Exception as e: |
|
logger.error(f"β Failed to save audio file: {str(e)}") |
|
return jsonify({"error": f"Failed to save audio file: {str(e)}"}), 500 |
|
|
|
return jsonify({ |
|
"message": "TTS audio generated", |
|
"file_url": f"/download/{os.path.basename(output_filename)}", |
|
"language": language, |
|
"text_length": len(text_input) |
|
}) |
|
except Exception as e: |
|
logger.error(f"β Unhandled exception in TTS endpoint: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"Internal server error: {str(e)}"}), 500 |
|
|
|
|
|
@app.route("/download/<filename>", methods=["GET"]) |
|
def download_audio(filename): |
|
file_path = os.path.join(OUTPUT_DIR, filename) |
|
if os.path.exists(file_path): |
|
logger.info(f"π€ Serving audio file: {file_path}") |
|
return send_file(file_path, mimetype="audio/wav", as_attachment=True) |
|
|
|
logger.warning(f"β οΈ Requested file not found: {file_path}") |
|
return jsonify({"error": "File not found"}), 404 |
|
|
|
@app.route("/translate", methods=["POST"]) |
|
def translate_text(): |
|
try: |
|
data = request.get_json() |
|
if not data: |
|
logger.warning("β οΈ Translation endpoint called with no JSON data") |
|
return jsonify({"error": "No JSON data provided"}), 400 |
|
|
|
source_text = data.get("text", "").strip() |
|
source_language = data.get("source_language", "").lower() |
|
target_language = data.get("target_language", "").lower() |
|
|
|
if not source_text: |
|
logger.warning("β οΈ Translation request with empty text") |
|
return jsonify({"error": "No text provided"}), 400 |
|
|
|
|
|
source_code = LANGUAGE_CODES.get(source_language, source_language) |
|
target_code = LANGUAGE_CODES.get(target_language, target_language) |
|
|
|
logger.info(f"π Translating from {source_language} to {target_language}: '{source_text}'") |
|
|
|
|
|
use_phi_model = False |
|
actual_source_code = source_code |
|
actual_target_code = target_code |
|
|
|
|
|
if (source_code == "pam" and target_code == "fil") or (source_code == "fil" and target_code == "pam"): |
|
use_phi_model = True |
|
elif (source_code == "pam" and target_code == "tgl"): |
|
use_phi_model = True |
|
actual_target_code = "fil" |
|
elif (source_code == "tgl" and target_code == "pam"): |
|
use_phi_model = True |
|
actual_source_code = "fil" |
|
|
|
if use_phi_model: |
|
model_key = "phi" |
|
|
|
|
|
if model_key not in translation_models or translation_models[model_key] is None: |
|
logger.error(f"β Translation model for {model_key} not loaded") |
|
return jsonify({"error": f"Translation model not available"}), 503 |
|
|
|
try: |
|
|
|
model = translation_models[model_key] |
|
tokenizer = translation_tokenizers[model_key] |
|
|
|
|
|
input_text = f">>{actual_target_code}<< {source_text}" |
|
|
|
logger.info(f"π Using phi model with input: '{input_text}'") |
|
|
|
|
|
tokenized = tokenizer(input_text, return_tensors="pt", padding=True) |
|
tokenized = {k: v.to(device) for k, v in tokenized.items()} |
|
|
|
|
|
with torch.no_grad(): |
|
translated = model.generate(**tokenized) |
|
|
|
|
|
result = tokenizer.decode(translated[0], skip_special_tokens=True) |
|
|
|
logger.info(f"β
Translation result: '{result}'") |
|
|
|
return jsonify({ |
|
"translated_text": result, |
|
"source_language": source_language, |
|
"target_language": target_language |
|
}) |
|
except Exception as e: |
|
logger.error(f"β Translation processing failed: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"Translation processing failed: {str(e)}"}), 500 |
|
else: |
|
|
|
lang_pair = f"{source_code}-{target_code}" |
|
|
|
|
|
if lang_pair not in translation_models: |
|
logger.warning(f"β οΈ No translation model available for {lang_pair}") |
|
return jsonify({"error": f"Translation from {source_language} to {target_language} is not supported yet"}), 400 |
|
|
|
if translation_models[lang_pair] is None or translation_tokenizers[lang_pair] is None: |
|
logger.error(f"β Translation model for {lang_pair} not loaded") |
|
return jsonify({"error": f"Translation model not available"}), 503 |
|
|
|
try: |
|
|
|
model = translation_models[lang_pair] |
|
tokenizer = translation_tokenizers[lang_pair] |
|
|
|
|
|
tokenized = tokenizer(source_text, return_tensors="pt", padding=True) |
|
tokenized = {k: v.to(device) for k, v in tokenized.items()} |
|
|
|
|
|
with torch.no_grad(): |
|
translated = model.generate(**tokenized) |
|
|
|
|
|
result = tokenizer.decode(translated[0], skip_special_tokens=True) |
|
|
|
logger.info(f"β
Translation result: '{result}'") |
|
|
|
return jsonify({ |
|
"translated_text": result, |
|
"source_language": source_language, |
|
"target_language": target_language |
|
}) |
|
except Exception as e: |
|
logger.error(f"β Translation processing failed: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"Translation processing failed: {str(e)}"}), 500 |
|
|
|
except Exception as e: |
|
logger.error(f"β Unhandled exception in translation endpoint: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"Internal server error: {str(e)}"}), 500 |
|
|
|
|
|
def calculate_similarity(text1, text2):
    """Return the similarity of two strings as a percentage (0-100)."""
    def clean_text(text):
        return text.lower()

    clean1 = clean_text(text1)
    clean2 = clean_text(text2)

    matcher = SequenceMatcher(None, clean1, clean2)
    return matcher.ratio() * 100
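# Illustrative example: SequenceMatcher gives roughly
# calculate_similarity("mayap a abak", "mayap abak") ~= 91, comfortably above
# the 70.0 acceptance threshold used in /evaluate below.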
|
|
|
|
|
@app.route("/evaluate", methods=["POST"]) |
|
def evaluate_pronunciation(): |
|
if asr_model is None or asr_processor is None: |
|
logger.error("β Evaluation endpoint called but ASR models aren't loaded") |
|
return jsonify({"error": "ASR model not available"}), 503 |
|
|
|
try: |
|
if "audio" not in request.files: |
|
logger.warning("β οΈ Evaluation request missing audio file") |
|
return jsonify({"error": "No audio file uploaded"}), 400 |
|
|
|
audio_file = request.files["audio"] |
|
reference_word = request.form.get("reference_word", "").strip() |
|
language = request.form.get("language", "tagalog").lower() |
|
|
|
|
|
reference_patterns = [ |
|
"mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun", |
|
"mayap_a_bengi", "komusta_ka" |
|
] |
|
|
|
if not reference_word or reference_word not in reference_patterns: |
|
logger.warning(f"β οΈ Invalid reference word: {reference_word}") |
|
return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400 |
|
|
|
lang_code = LANGUAGE_CODES.get(language, language) |
|
logger.info(f"π Evaluating pronunciation of '{reference_word}' in {language}") |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio: |
|
temp_audio.write(audio_file.read()) |
|
user_audio_path = temp_audio.name |
|
logger.debug(f"π User audio saved to {user_audio_path}") |
|
|
|
|
|
try: |
|
|
|
user_waveform, sr = librosa.load(user_audio_path, sr=SAMPLE_RATE, mono=True) |
|
|
|
|
|
processed_path = os.path.join(OUTPUT_DIR, "processed_user_audio.wav") |
|
sf.write(processed_path, user_waveform, SAMPLE_RATE) |
|
logger.debug(f"π Processed user audio saved to {processed_path}") |
|
|
|
|
|
user_audio_path = processed_path |
|
except Exception as e: |
|
logger.error(f"β Audio processing failed: {str(e)}") |
|
return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500 |
|
|
|
|
|
reference_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_word) |
|
if not os.path.exists(reference_dir): |
|
logger.warning(f"β οΈ Reference directory not found: {reference_dir}") |
|
return jsonify({"error": f"Reference audio for {reference_word} not found"}), 404 |
|
|
|
reference_files = glob.glob(os.path.join(reference_dir, "*.wav")) |
|
if not reference_files: |
|
logger.warning(f"β οΈ No reference audio files found in {reference_dir}") |
|
return jsonify({"error": f"No reference audio found for {reference_word}"}), 404 |
|
|
|
logger.info(f"π Found {len(reference_files)} reference files for '{reference_word}'") |
|
|
|
|
|
try: |
|
|
|
inputs = asr_processor( |
|
user_waveform, |
|
sampling_rate=SAMPLE_RATE, |
|
return_tensors="pt", |
|
language=lang_code |
|
) |
|
inputs = {k: v.to(device) for k, v in inputs.items()} |
|
|
|
|
|
with torch.no_grad(): |
|
logits = asr_model(**inputs).logits |
|
ids = torch.argmax(logits, dim=-1)[0] |
|
user_transcription = asr_processor.decode(ids) |
|
|
|
logger.info(f"β
User transcription: {user_transcription}") |
|
except Exception as e: |
|
logger.error(f"β ASR inference failed: {str(e)}") |
|
return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500 |
|
|
|
|
|
results = [] |
|
best_score = 0 |
|
best_reference = None |
|
|
|
for ref_file in reference_files: |
|
try: |
|
|
|
ref_waveform, _ = librosa.load(ref_file, sr=SAMPLE_RATE, mono=True) |
|
|
|
|
|
inputs = asr_processor( |
|
ref_waveform, |
|
sampling_rate=SAMPLE_RATE, |
|
return_tensors="pt", |
|
language=lang_code |
|
) |
|
inputs = {k: v.to(device) for k, v in inputs.items()} |
|
|
|
with torch.no_grad(): |
|
logits = asr_model(**inputs).logits |
|
ids = torch.argmax(logits, dim=-1)[0] |
|
ref_transcription = asr_processor.decode(ids) |
|
|
|
|
|
similarity = calculate_similarity(user_transcription, ref_transcription) |
|
|
|
results.append({ |
|
"reference_file": os.path.basename(ref_file), |
|
"reference_text": ref_transcription, |
|
"similarity_score": similarity |
|
}) |
|
|
|
if similarity > best_score: |
|
best_score = similarity |
|
best_reference = os.path.basename(ref_file) |
|
|
|
logger.debug(f"π Reference '{os.path.basename(ref_file)}': {similarity:.2f}%") |
|
except Exception as e: |
|
logger.error(f"β Error processing reference audio {ref_file}: {str(e)}") |
|
|
|
|
|
try: |
|
if os.path.exists(user_audio_path) and user_audio_path != processed_path: |
|
os.unlink(user_audio_path) |
|
except Exception as e: |
|
logger.warning(f"β οΈ Failed to clean up temp files: {str(e)}") |
|
|
|
|
|
is_correct = best_score >= 70.0 |
|
feedback = "Great pronunciation!" if is_correct else "Try again! Listen to the sample" |
|
|
|
return jsonify({ |
|
"is_correct": is_correct, |
|
"score": best_score, |
|
"feedback": feedback, |
|
"transcription": user_transcription, |
|
"reference_word": reference_word, |
|
"details": results |
|
}) |
|
|
|
except Exception as e: |
|
logger.error(f"β Unhandled exception in evaluation endpoint: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"Internal server error: {str(e)}"}), 500 |
|
|
|
@app.route("/upload_reference", methods=["POST"]) |
|
def upload_reference_audio(): |
|
try: |
|
if "audio" not in request.files: |
|
logger.warning("β οΈ Reference upload missing audio file") |
|
return jsonify({"error": "No audio file uploaded"}), 400 |
|
|
|
reference_word = request.form.get("reference_word", "").strip() |
|
if not reference_word: |
|
logger.warning("β οΈ Reference upload missing reference word") |
|
return jsonify({"error": "No reference word provided"}), 400 |
|
|
|
|
|
reference_patterns = [ |
|
"mayap_a_abak", "mayap_a_ugtu", "mayap_a_gatpanapun", |
|
"mayap_a_bengi", "komusta_ka" |
|
] |
|
|
|
if reference_word not in reference_patterns: |
|
logger.warning(f"β οΈ Invalid reference word: {reference_word}") |
|
return jsonify({"error": f"Invalid reference word. Available: {reference_patterns}"}), 400 |
|
|
|
|
|
pattern_dir = os.path.join(REFERENCE_AUDIO_DIR, reference_word) |
|
os.makedirs(pattern_dir, exist_ok=True) |
|
|
|
|
|
audio_file = request.files["audio"] |
|
file_path = os.path.join(pattern_dir, secure_filename(audio_file.filename)) |
|
audio_file.save(file_path) |
|
|
|
|
|
if not file_path.lower().endswith('.wav'): |
|
base_path = os.path.splitext(file_path)[0] |
|
wav_path = f"{base_path}.wav" |
|
try: |
|
audio = AudioSegment.from_file(file_path) |
|
audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1) |
|
audio.export(wav_path, format="wav") |
|
|
|
os.unlink(file_path) |
|
file_path = wav_path |
|
except Exception as e: |
|
logger.error(f"β Reference audio conversion failed: {str(e)}") |
|
return jsonify({"error": f"Audio conversion failed: {str(e)}"}), 500 |
|
|
|
logger.info(f"β
Reference audio saved successfully for {reference_word}: {file_path}") |
|
|
|
|
|
references = glob.glob(os.path.join(pattern_dir, "*.wav")) |
|
return jsonify({ |
|
"message": "Reference audio uploaded successfully", |
|
"reference_word": reference_word, |
|
"file": os.path.basename(file_path), |
|
"total_references": len(references) |
|
}) |
|
|
|
except Exception as e: |
|
logger.error(f"β Unhandled exception in reference upload: {str(e)}") |
|
logger.debug(f"Stack trace: {traceback.format_exc()}") |
|
return jsonify({"error": f"Internal server error: {str(e)}"}), 500 |
|
|
|
|
|
@app.before_first_request
def setup_reference_audio():
    try:
        os.makedirs(REFERENCE_AUDIO_DIR, exist_ok=True)
        logger.info(f"Created reference audio directory: {REFERENCE_AUDIO_DIR}")
    except Exception as e:
        logger.error(f"Failed to set up reference audio directory: {str(e)}")
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
logger.info("π Starting Speech API server") |
|
logger.info(f"π System status: ASR model: {'β
' if asr_model else 'β'}") |
|
for lang, model in tts_models.items(): |
|
logger.info(f"π TTS model {lang}: {'β
' if model else 'β'}") |
|
|
|
app.run(host="0.0.0.0", port=7860, debug=True) |
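# Example requests (illustrative; adjust host, file names, and text as needed):
#
#   curl http://localhost:7860/health
#
#   curl -X POST -F "audio=@sample.wav" -F "language=tagalog" \
#        http://localhost:7860/asr
#
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"text": "Mayap a abak", "language": "kapampangan"}' \
#        http://localhost:7860/tts
#
#   curl -X POST -H "Content-Type: application/json" \
#        -d '{"text": "Good morning", "source_language": "english", "target_language": "kapampangan"}' \
#        http://localhost:7860/translate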