# app.py (for MeloTTS API Space)
"""Gradio API wrapper around MeloTTS, configured for Korean.

Exposes a single ``synthesize`` endpoint that takes text and returns the
synthesized speech as a ``data:audio/wav;base64,...`` URI string.
"""
import base64
import io
import logging
import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch

# --- Setup ---
# This command is crucial and needs to run once.
# It downloads the dictionary needed for Japanese/Korean.
# Kept ahead of the melo import, matching the original statement order.
os.system('python -m unidic download')

from melo.api import TTS  # noqa: E402  (intentionally imported after the download)

# --- Logging ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# --- Model Loading ---
MODEL = None
SPEAKER_ID = None  # Resolved from the model's speaker table once it is loaded.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LANGUAGE = 'KR'
# Speed multiplier passed to MeloTTS for every request.
SPEED = 0.9

# Parameters of the short silent clip returned for empty input.
# NOTE(review): the original comment stated "Melo uses 24kHz"; confirm this
# against the loaded model's configured sampling rate.
SILENCE_SAMPLE_RATE = 24000
SILENCE_SECONDS = 0.1

try:
    logger.info(
        "Loading MeloTTS model for language: %s on device: %s", LANGUAGE, DEVICE
    )
    MODEL = TTS(language=LANGUAGE, device=DEVICE)
    # tts_to_file expects the numeric speaker ID, not the speaker's name, so
    # look it up in the model's name -> ID table (per the MeloTTS usage docs).
    SPEAKER_ID = MODEL.hps.data.spk2id[LANGUAGE]
    logger.info("MeloTTS model loaded successfully.")
    logger.info("Default speaker: %s, Default speed: %s", SPEAKER_ID, SPEED)
except Exception as e:
    # Top-level boundary: keep the app importable so the endpoint can report
    # a clear error instead of the Space crashing on startup.
    logger.exception("FATAL: MeloTTS model initialization error: %s", e)
    MODEL = None
    SPEAKER_ID = None


def _wav_buffer_to_data_uri(wav_buffer):
    """Return the full contents of *wav_buffer* as a base64 WAV data URI."""
    encoded = base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
    return f"data:audio/wav;base64,{encoded}"


def _silent_data_uri():
    """Return a short silent WAV clip encoded as a data URI."""
    samples = np.zeros(int(SILENCE_SECONDS * SILENCE_SAMPLE_RATE), dtype=np.int16)
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, samples, SILENCE_SAMPLE_RATE, format='WAV')
    return _wav_buffer_to_data_uri(wav_buffer)


# --- Main TTS Synthesis Function ---
def synthesize(text_to_synthesize):
    """Synthesize speech for the given text and return it as a data URI.

    Args:
        text_to_synthesize: Text to convert to speech. ``None``, empty, or
            whitespace-only input yields a short silent clip.

    Returns:
        A ``data:audio/wav;base64,...`` string containing the WAV audio.

    Raises:
        gr.Error: If the model failed to load at startup, or if synthesis
            raises any exception.
    """
    if MODEL is None:
        raise gr.Error("TTS Model is not loaded. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Nothing to speak: return a silent audio data URI.
        return _silent_data_uri()

    try:
        logger.info("Synthesizing for text: '%s...'", text_to_synthesize[:80])

        # Write the audio into memory rather than to a file on disk; the
        # explicit format is needed because a buffer has no file extension.
        wav_buffer = io.BytesIO()
        MODEL.tts_to_file(
            text_to_synthesize,
            SPEAKER_ID,
            wav_buffer,
            speed=SPEED,
            format='wav',
        )

        audio_uri = _wav_buffer_to_data_uri(wav_buffer)
        logger.info("Synthesis complete.")
        return audio_uri

    except Exception as e:
        logger.exception("MeloTTS synthesis error: %s", e)
        raise gr.Error(f"An error occurred during synthesis: {str(e)}") from e


# --- Create and Launch the Gradio Interface ---
# We create a pure API without a complex UI.
iface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Text to Synthesize"),
    # Output is a text string (the data URI) for the API.
    outputs=gr.Textbox(label="Base64 Audio Output"),
    title="MeloTTS API for Korean",
    description="A simplified API for MeloTTS, configured for Korean language.",
    api_name="synthesize",
)

# The .queue() is important for handling multiple requests on HF Spaces.
iface.queue().launch()