"""Flask TTS API serving Facebook MMS-TTS (VITS) models for Kapampangan,
Tagalog, and English. Designed to run in a Hugging Face Space, where only
/tmp is writable."""

import os

# Set ALL cache directories to /tmp BEFORE importing torch/transformers:
# both libraries read these environment variables at import time, so
# setting them after the imports (as the original code did) is too late.
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/huggingface_cache"
os.environ["TORCH_HOME"] = "/tmp/torch_home"

import torch
import torchaudio  # noqa: F401  (kept: may be required for soundfile/torch audio backends)
import soundfile as sf
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from transformers import VitsModel, AutoTokenizer

app = Flask(__name__)
CORS(app)  # Allow external (cross-origin) requests

# Model repos for each supported language (Hugging Face Hub).
MODELS = {
    "kapampangan": "facebook/mms-tts-pam",
    "tagalog": "facebook/mms-tts-tgl",
    "english": "facebook/mms-tts-eng",
}

loaded_models = {}
loaded_processors = {}

for lang, path in MODELS.items():
    try:
        print(f"🔄 Loading {lang} model: {path}...")
        # Force downloads into /tmp (the only writable location in Spaces).
        loaded_models[lang] = VitsModel.from_pretrained(
            path, cache_dir="/tmp/huggingface_cache"
        )
        loaded_processors[lang] = AutoTokenizer.from_pretrained(
            path, cache_dir="/tmp/huggingface_cache"
        )
        print(f"✅ {lang.capitalize()} model loaded successfully!")
    except Exception as e:
        print(f"❌ Error loading {lang} model: {str(e)}")
        loaded_models[lang] = None  # Mark as unavailable
        loaded_processors[lang] = None

# Constants
DEFAULT_SAMPLE_RATE = 16000  # MMS-TTS models are 16 kHz; used as fallback only
OUTPUT_DIR = "/tmp/"
os.makedirs(OUTPUT_DIR, exist_ok=True)


@app.route("/", methods=["GET"])
def home():
    """Root route to check if the API is running."""
    return jsonify({"message": "TTS API is running. Use /tts to generate speech."})


@app.route("/tts", methods=["POST"])
def generate_tts():
    """Generate TTS audio from JSON body {"text": ..., "language": ...}.

    Returns JSON with a /download URL on success; 400 on bad input,
    500 if the model is unavailable or synthesis fails.
    """
    try:
        data = request.get_json()
        text_input = data.get("text", "").strip()
        language = data.get("language", "kapampangan").lower()

        # Validate inputs
        if language not in MODELS:
            return jsonify({"error": "Invalid language. Choose 'kapampangan', 'tagalog', or 'english'."}), 400
        if not text_input:
            return jsonify({"error": "No text provided"}), 400
        if loaded_models[language] is None:
            return jsonify({"error": f"Model for {language} failed to load"}), 500

        print(f"🔄 Generating speech for '{text_input}' in {language}...")

        processor = loaded_processors[language]
        model = loaded_models[language]
        inputs = processor(text_input, return_tensors="pt")

        # VITS models do not implement .generate(); the documented API is a
        # forward pass whose output carries the synthesized waveform.
        with torch.no_grad():
            output = model(**inputs).waveform

        waveform = output.squeeze().cpu().numpy()

        # Use the model's actual output rate rather than a hard-coded value.
        sample_rate = getattr(model.config, "sampling_rate", DEFAULT_SAMPLE_RATE)

        output_filename = os.path.join(OUTPUT_DIR, f"{language}_output.wav")
        sf.write(output_filename, waveform, sample_rate)

        print(f"✅ Speech generated! File saved: {output_filename}")
        return jsonify({
            "message": "TTS audio generated",
            "file_url": f"/download/{language}_output.wav"
        })

    except Exception as e:
        print(f"❌ Error generating TTS: {e}")
        return jsonify({"error": f"Internal server error: {str(e)}"}), 500


@app.route("/download/<filename>", methods=["GET"])
def download_audio(filename):
    """Serve a generated audio file by name.

    The route rule must declare <filename>; the original rule
    ("/download/") had no URL variable, so Flask raised at registration.
    """
    # basename() strips any directory components, preventing path
    # traversal (e.g. "../../etc/passwd") from escaping OUTPUT_DIR.
    safe_name = os.path.basename(filename)
    file_path = os.path.join(OUTPUT_DIR, safe_name)
    if os.path.isfile(file_path):
        return send_file(file_path, mimetype="audio/wav", as_attachment=True)
    return jsonify({"error": "File not found"}), 404


if __name__ == "__main__":
    # debug=False: the Werkzeug debugger on a publicly-bound host is a
    # remote-code-execution risk and must never run in production.
    app.run(host="0.0.0.0", port=7860, debug=False)