# Set cache directories first, before other imports
import os
import sys
import logging
import traceback

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger("speech_api")

# Set all cache directories to locations within /tmp
cache_dirs = {
    "HF_HOME": "/tmp/hf_home",
    "TRANSFORMERS_CACHE": "/tmp/transformers_cache",
    "HUGGINGFACE_HUB_CACHE": "/tmp/huggingface_hub_cache",
    "TORCH_HOME": "/tmp/torch_home",
    "XDG_CACHE_HOME": "/tmp/xdg_cache"
}

# Set environment variables and create directories
for env_var, path in cache_dirs.items():
    os.environ[env_var] = path
    try:
        os.makedirs(path, exist_ok=True)
        logger.info(f"📁 Created cache directory: {path}")
    except Exception as e:
        logger.error(f"❌ Failed to create directory {path}: {str(e)}")

# Now import the rest of the libraries
try:
    import torch
    from pydub import AudioSegment
    import tempfile
    import torchaudio
    import soundfile as sf
    from flask import Flask, request, jsonify, send_file
    from flask_cors import CORS
    from transformers import Wav2Vec2ForCTC, AutoProcessor, VitsModel, AutoTokenizer
    from transformers import MarianMTModel, MarianTokenizer

    logger.info("✅ All required libraries imported successfully")
except ImportError as e:
    logger.critical(f"❌ Failed to import necessary libraries: {str(e)}")
    sys.exit(1)

# Check CUDA availability
if torch.cuda.is_available():
    logger.info(f"🚀 CUDA available: {torch.cuda.get_device_name(0)}")
    device = "cuda"
else:
    logger.info("⚠️ CUDA not available, using CPU")
    device = "cpu"

app = Flask(__name__)
CORS(app)

# ASR Model
ASR_MODEL_ID = "Coco-18/mms-asr-tgl-en-safetensor"
logger.info(f"🔄 Loading ASR model: {ASR_MODEL_ID}")

asr_processor = None
asr_model = None
try:
    asr_processor = AutoProcessor.from_pretrained(
        ASR_MODEL_ID,
        cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
    )
    logger.info("✅ ASR processor loaded successfully")

    asr_model = Wav2Vec2ForCTC.from_pretrained(
        ASR_MODEL_ID,
        cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
    )
    asr_model.to(device)
    logger.info(f"✅ ASR model loaded successfully on {device}")
except Exception as e:
    logger.error(f"❌ Error loading ASR model: {str(e)}")
    logger.debug(f"Stack trace: {traceback.format_exc()}")
    logger.debug(f"Python version: {sys.version}")
    logger.debug(f"Current working directory: {os.getcwd()}")
    logger.debug(f"Temp directory exists: {os.path.exists('/tmp')}")
    logger.debug(f"Temp directory writeable: {os.access('/tmp', os.W_OK)}")

# Language-specific configurations
LANGUAGE_CODES = {
    "kapampangan": "pam",
    "filipino": "fil",  # the phi translation model uses fil rather than tgl
    "english": "eng",
    "tagalog": "tgl",
}

# TTS Models (Kapampangan, Tagalog, English)
TTS_MODELS = {
    "kapampangan": "facebook/mms-tts-pam",
    "tagalog": "facebook/mms-tts-tgl",
    "english": "facebook/mms-tts-eng"
}

tts_models = {}
tts_processors = {}
for lang, model_id in TTS_MODELS.items():
    logger.info(f"🔄 Loading TTS model for {lang}: {model_id}")
    try:
        tts_processors[lang] = AutoTokenizer.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        logger.info(f"✅ {lang} TTS processor loaded")

        tts_models[lang] = VitsModel.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        tts_models[lang].to(device)
        logger.info(f"✅ {lang} TTS model loaded on {device}")
    except Exception as e:
        logger.error(f"❌ Failed to load {lang} TTS model: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        tts_models[lang] = None
# Translation models: one MarianMT checkpoint per direct language pair, plus a
# shared multilingual "phi" model for the Philippine-language pairs
TRANSLATION_MODELS = {
    "pam-eng": "Coco-18/opus-mt-pam-en",
    "eng-pam": "Coco-18/opus-mt-en-pam",
    "tgl-eng": "Helsinki-NLP/opus-mt-tl-en",
    "eng-tgl": "Helsinki-NLP/opus-mt-en-tl",
    "phi": "Coco-18/opus-mt-phi"
}

# Initialize translation models and tokenizers
translation_models = {}
translation_tokenizers = {}
for model_key, model_id in TRANSLATION_MODELS.items():
    logger.info(f"🔄 Loading translation model: {model_id}")
    try:
        translation_tokenizers[model_key] = MarianTokenizer.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        logger.info(f"✅ Translation tokenizer loaded successfully for {model_key}")

        translation_models[model_key] = MarianMTModel.from_pretrained(
            model_id,
            cache_dir=cache_dirs["TRANSFORMERS_CACHE"]
        )
        translation_models[model_key].to(device)
        logger.info(f"✅ Translation model loaded successfully on {device} for {model_key}")
    except Exception as e:
        logger.error(f"❌ Error loading translation model for {model_key}: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        translation_models[model_key] = None
        translation_tokenizers[model_key] = None

# Constants
SAMPLE_RATE = 16000
OUTPUT_DIR = "/tmp/audio_outputs"
try:
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    logger.info(f"📁 Created output directory: {OUTPUT_DIR}")
except Exception as e:
    logger.error(f"❌ Failed to create output directory: {str(e)}")


@app.route("/", methods=["GET"])
def home():
    return jsonify({"message": "Speech API is running", "status": "active"})


@app.route("/health", methods=["GET"])
def health_check():
    # Initialize direct language pair statuses based on loaded models
    translation_status = {}

    # Add status for direct model pairs
    for lang_pair in ["pam-eng", "eng-pam", "tgl-eng", "eng-tgl"]:
        translation_status[lang_pair] = "loaded" if lang_pair in translation_models and translation_models[lang_pair] is not None else "failed"

    # The Philippine-language pairs are all served by the shared phi model
    phi_status = "loaded" if "phi" in translation_models and translation_models["phi"] is not None else "failed"
    translation_status["pam-fil"] = phi_status
    translation_status["fil-pam"] = phi_status
    translation_status["pam-tgl"] = phi_status  # phi model, with tgl mapped to fil
    translation_status["tgl-pam"] = phi_status  # phi model, with tgl mapped to fil

    health_status = {
        "api_status": "online",
        "asr_model": "loaded" if asr_model is not None else "failed",
        "tts_models": {lang: "loaded" if model is not None else "failed"
                       for lang, model in tts_models.items()},
        "translation_models": translation_status,
        "device": device
    }
    return jsonify(health_status)
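
# Illustrative client sketch (not used by the server): how a caller might poll
# /health before sending work. Assumes the `requests` package is installed and
# the server is reachable at localhost:7860; adjust the URL for your deployment.
def _example_health_check(base_url="http://localhost:7860"):
    import requests  # imported lazily so the server itself does not need it

    status = requests.get(f"{base_url}/health").json()
    # e.g. {"api_status": "online", "asr_model": "loaded", ...}
    return status["asr_model"] == "loaded"
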
@app.route("/asr", methods=["POST"])
def transcribe_audio():
    if asr_model is None or asr_processor is None:
        logger.error("❌ ASR endpoint called but models aren't loaded")
        return jsonify({"error": "ASR model not available"}), 503

    try:
        if "audio" not in request.files:
            logger.warning("⚠️ ASR request missing audio file")
            return jsonify({"error": "No audio file uploaded"}), 400

        audio_file = request.files["audio"]
        language = request.form.get("language", "english").lower()

        if language not in LANGUAGE_CODES:
            logger.warning(f"⚠️ Unsupported language requested: {language}")
            return jsonify({"error": f"Unsupported language: {language}. Available: {list(LANGUAGE_CODES.keys())}"}), 400

        lang_code = LANGUAGE_CODES[language]
        logger.info(f"🔄 Processing {language} audio for ASR")

        # Save the uploaded file temporarily
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(audio_file.filename)[-1]) as temp_audio:
            temp_audio.write(audio_file.read())
            temp_audio_path = temp_audio.name
        logger.debug(f"📁 Temporary audio saved to {temp_audio_path}")

        # Convert to WAV if necessary
        wav_path = temp_audio_path
        if not audio_file.filename.lower().endswith(".wav"):
            wav_path = os.path.join(OUTPUT_DIR, "converted_audio.wav")
            logger.info(f"🔄 Converting audio to WAV format: {wav_path}")
            try:
                audio = AudioSegment.from_file(temp_audio_path)
                audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(1)
                audio.export(wav_path, format="wav")
            except Exception as e:
                logger.error(f"❌ Audio conversion failed: {str(e)}")
                return jsonify({"error": f"Audio conversion failed: {str(e)}"}), 500

        # Load and process the WAV file
        try:
            waveform, sr = torchaudio.load(wav_path)
            logger.debug(f"✅ Audio loaded: {wav_path} (Sample rate: {sr}Hz)")

            # Resample if needed
            if sr != SAMPLE_RATE:
                logger.info(f"🔄 Resampling audio from {sr}Hz to {SAMPLE_RATE}Hz")
                waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)

            # Peak-normalize to [-1, 1]
            waveform = waveform / torch.max(torch.abs(waveform))
        except Exception as e:
            logger.error(f"❌ Failed to load or process audio: {str(e)}")
            return jsonify({"error": f"Audio processing failed: {str(e)}"}), 500

        # Process audio for ASR
        try:
            # NOTE: Wav2Vec2 processors silently ignore unrecognized kwargs such
            # as `language`; MMS checkpoints normally select the target language
            # via processor.tokenizer.set_target_lang() and model.load_adapter().
            inputs = asr_processor(
                waveform.squeeze().numpy(),
                sampling_rate=SAMPLE_RATE,
                return_tensors="pt",
                language=lang_code
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
        except Exception as e:
            logger.error(f"❌ ASR preprocessing failed: {str(e)}")
            return jsonify({"error": f"ASR preprocessing failed: {str(e)}"}), 500

        # Perform ASR
        try:
            with torch.no_grad():
                logits = asr_model(**inputs).logits
            ids = torch.argmax(logits, dim=-1)[0]
            transcription = asr_processor.decode(ids)
            logger.info(f"✅ Transcription ({language}): {transcription}")

            # Clean up temp files
            try:
                os.unlink(temp_audio_path)
                if wav_path != temp_audio_path:
                    os.unlink(wav_path)
            except Exception as e:
                logger.warning(f"⚠️ Failed to clean up temp files: {str(e)}")

            return jsonify({
                "transcription": transcription,
                "language": language,
                "language_code": lang_code
            })
        except Exception as e:
            logger.error(f"❌ ASR inference failed: {str(e)}")
            logger.debug(f"Stack trace: {traceback.format_exc()}")
            return jsonify({"error": f"ASR inference failed: {str(e)}"}), 500
    except Exception as e:
        logger.error(f"❌ Unhandled exception in ASR endpoint: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500
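
# Illustrative client sketch (not used by the server): posting a WAV file to
# /asr as multipart form data. The file path and base URL are placeholders;
# assumes the `requests` package is installed.
def _example_asr_request(base_url="http://localhost:7860"):
    import requests  # imported lazily so the server itself does not need it

    with open("sample.wav", "rb") as f:  # placeholder path
        resp = requests.post(
            f"{base_url}/asr",
            files={"audio": ("sample.wav", f, "audio/wav")},
            data={"language": "tagalog"},
        )
    resp.raise_for_status()
    return resp.json()["transcription"]
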
@app.route("/tts", methods=["POST"])
def generate_tts():
    try:
        data = request.get_json()
        if not data:
            logger.warning("⚠️ TTS endpoint called with no JSON data")
            return jsonify({"error": "No JSON data provided"}), 400

        text_input = data.get("text", "").strip()
        language = data.get("language", "kapampangan").lower()

        if not text_input:
            logger.warning("⚠️ TTS request with empty text")
            return jsonify({"error": "No text provided"}), 400

        if language not in TTS_MODELS:
            logger.warning(f"⚠️ TTS requested for unsupported language: {language}")
            return jsonify({"error": f"Invalid language. Available options: {list(TTS_MODELS.keys())}"}), 400

        if tts_models[language] is None:
            logger.error(f"❌ TTS model for {language} not loaded")
            return jsonify({"error": f"TTS model for {language} not available"}), 503

        logger.info(f"🔄 Generating TTS for language: {language}, text: '{text_input}'")

        try:
            processor = tts_processors[language]
            model = tts_models[language]
            inputs = processor(text_input, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
        except Exception as e:
            logger.error(f"❌ TTS preprocessing failed: {str(e)}")
            return jsonify({"error": f"TTS preprocessing failed: {str(e)}"}), 500

        # Generate speech
        try:
            with torch.no_grad():
                output = model(**inputs).waveform
            waveform = output.squeeze().cpu().numpy()
        except Exception as e:
            logger.error(f"❌ TTS inference failed: {str(e)}")
            logger.debug(f"Stack trace: {traceback.format_exc()}")
            return jsonify({"error": f"TTS inference failed: {str(e)}"}), 500

        # Save to file
        try:
            output_filename = os.path.join(OUTPUT_DIR, f"{language}_output.wav")
            sampling_rate = model.config.sampling_rate
            sf.write(output_filename, waveform, sampling_rate)
            logger.info(f"✅ Speech generated! File saved: {output_filename}")
        except Exception as e:
            logger.error(f"❌ Failed to save audio file: {str(e)}")
            return jsonify({"error": f"Failed to save audio file: {str(e)}"}), 500

        return jsonify({
            "message": "TTS audio generated",
            "file_url": f"/download/{os.path.basename(output_filename)}",
            "language": language,
            "text_length": len(text_input)
        })
    except Exception as e:
        logger.error(f"❌ Unhandled exception in TTS endpoint: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500


# Note: the route needs the <filename> converter so Flask passes the filename
# argument into the view function
@app.route("/download/<filename>", methods=["GET"])
def download_audio(filename):
    file_path = os.path.join(OUTPUT_DIR, filename)
    if os.path.exists(file_path):
        logger.info(f"📤 Serving audio file: {file_path}")
        return send_file(file_path, mimetype="audio/wav", as_attachment=True)
    logger.warning(f"⚠️ Requested file not found: {file_path}")
    return jsonify({"error": "File not found"}), 404
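
# Illustrative client sketch (not used by the server): requesting synthesis via
# /tts, then fetching the generated WAV from the returned file_url. Base URL
# and output path are placeholders; assumes the `requests` package is installed.
def _example_tts_request(base_url="http://localhost:7860"):
    import requests  # imported lazily so the server itself does not need it

    resp = requests.post(f"{base_url}/tts",
                         json={"text": "Mayap a abak", "language": "kapampangan"})
    resp.raise_for_status()
    file_url = resp.json()["file_url"]  # e.g. "/download/kapampangan_output.wav"

    audio = requests.get(f"{base_url}{file_url}")
    with open("tts_output.wav", "wb") as f:  # placeholder output path
        f.write(audio.content)
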
@app.route("/translate", methods=["POST"])
def translate_text():
    try:
        data = request.get_json()
        if not data:
            logger.warning("⚠️ Translation endpoint called with no JSON data")
            return jsonify({"error": "No JSON data provided"}), 400

        source_text = data.get("text", "").strip()
        source_language = data.get("source_language", "").lower()
        target_language = data.get("target_language", "").lower()

        if not source_text:
            logger.warning("⚠️ Translation request with empty text")
            return jsonify({"error": "No text provided"}), 400

        # Map language names to codes
        source_code = LANGUAGE_CODES.get(source_language, source_language)
        target_code = LANGUAGE_CODES.get(target_language, target_language)

        logger.info(f"🔄 Translating from {source_language} to {target_language}: '{source_text}'")

        # Special handling for pam-fil, fil-pam, pam-tgl and tgl-pam using the phi model
        use_phi_model = False
        actual_source_code = source_code
        actual_target_code = target_code

        # Check if we need to use the phi model with fil replacement
        if (source_code == "pam" and target_code == "fil") or (source_code == "fil" and target_code == "pam"):
            use_phi_model = True
        elif source_code == "pam" and target_code == "tgl":
            use_phi_model = True
            actual_target_code = "fil"  # Replace tgl with fil for the phi model
        elif source_code == "tgl" and target_code == "pam":
            use_phi_model = True
            actual_source_code = "fil"  # Replace tgl with fil for the phi model

        if use_phi_model:
            model_key = "phi"

            # Check if we have the phi model
            if model_key not in translation_models or translation_models[model_key] is None:
                logger.error(f"❌ Translation model for {model_key} not loaded")
                return jsonify({"error": "Translation model not available"}), 503

            try:
                # Get the phi model and tokenizer
                model = translation_models[model_key]
                tokenizer = translation_tokenizers[model_key]

                # Prepend target language token to input
                input_text = f">>{actual_target_code}<< {source_text}"
                logger.info(f"🔄 Using phi model with input: '{input_text}'")

                # Tokenize the text
                tokenized = tokenizer(input_text, return_tensors="pt", padding=True)
                tokenized = {k: v.to(device) for k, v in tokenized.items()}

                # Generate translation
                with torch.no_grad():
                    translated = model.generate(**tokenized)

                # Decode the translation
                result = tokenizer.decode(translated[0], skip_special_tokens=True)
                logger.info(f"✅ Translation result: '{result}'")

                return jsonify({
                    "translated_text": result,
                    "source_language": source_language,
                    "target_language": target_language
                })
            except Exception as e:
                logger.error(f"❌ Translation processing failed: {str(e)}")
                logger.debug(f"Stack trace: {traceback.format_exc()}")
                return jsonify({"error": f"Translation processing failed: {str(e)}"}), 500
        else:
            # Create the regular language pair key for other language pairs
            lang_pair = f"{source_code}-{target_code}"

            # Check if we have a model for this language pair
            if lang_pair not in translation_models:
                logger.warning(f"⚠️ No translation model available for {lang_pair}")
                return jsonify({"error": f"Translation from {source_language} to {target_language} is not supported yet"}), 400

            if translation_models[lang_pair] is None or translation_tokenizers[lang_pair] is None:
                logger.error(f"❌ Translation model for {lang_pair} not loaded")
                return jsonify({"error": "Translation model not available"}), 503

            try:
                # Regular translation process for other language pairs
                model = translation_models[lang_pair]
                tokenizer = translation_tokenizers[lang_pair]

                # Tokenize the text
                tokenized = tokenizer(source_text, return_tensors="pt", padding=True)
                tokenized = {k: v.to(device) for k, v in tokenized.items()}

                # Generate translation
                with torch.no_grad():
                    translated = model.generate(**tokenized)

                # Decode the translation
                result = tokenizer.decode(translated[0], skip_special_tokens=True)
                logger.info(f"✅ Translation result: '{result}'")

                return jsonify({
                    "translated_text": result,
                    "source_language": source_language,
                    "target_language": target_language
                })
            except Exception as e:
                logger.error(f"❌ Translation processing failed: {str(e)}")
                logger.debug(f"Stack trace: {traceback.format_exc()}")
                return jsonify({"error": f"Translation processing failed: {str(e)}"}), 500
    except Exception as e:
        logger.error(f"❌ Unhandled exception in translation endpoint: {str(e)}")
        logger.debug(f"Stack trace: {traceback.format_exc()}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500


if __name__ == "__main__":
    logger.info("🚀 Starting Speech API server")
    logger.info(f"📊 System status: ASR model: {'✅' if asr_model else '❌'}")
    for lang, model in tts_models.items():
        logger.info(f"📊 TTS model {lang}: {'✅' if model else '❌'}")

    app.run(host="0.0.0.0", port=7860, debug=True)
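
# Illustrative /translate request (comment sketch only, since code below the
# __main__ guard would not run while the server is serving). Assumes the
# `requests` package and a local server; language names follow LANGUAGE_CODES:
#
#   import requests
#   resp = requests.post("http://localhost:7860/translate",
#                        json={"text": "Kumusta ka?",
#                              "source_language": "tagalog",
#                              "target_language": "english"})
#   print(resp.json()["translated_text"])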