# app.py (for your new MeloTTS space)
import gradio as gr
import torch
import io
import os
import numpy as np
import soundfile as sf
import base64
import logging

# This command is important and should run at the start
os.system('python -m unidic download')

from melo.api import TTS

# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration ---
# We pre-configure everything here.
LANGUAGE = 'KR'
# NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point. Adjust if needed.
SPEED = 0.8
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
SPEAKER_ID = 'KR'  # Default Korean speaker

# --- Load Model (this happens only once when the space starts) ---
MODEL_INSTANCE = None
try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}...")
    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
    logger.info("MeloTTS model loaded successfully.")
except Exception as e:
    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
    MODEL_INSTANCE = None


def synthesize(text_to_synthesize):
    """
    Takes text input and returns a base64-encoded WAV audio data URI string.
    """
    if not MODEL_INSTANCE:
        raise gr.Error("TTS Model is not available. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Create and return a silent audio data URI for empty input
        silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, 24000, format='WAV')
        wav_buffer.seek(0)
        wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
        return f"data:audio/wav;base64,{wav_base64}"

    try:
        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")

        # Use an in-memory BytesIO object to hold the audio data
        wav_buffer = io.BytesIO()

        # Synthesize audio directly to the buffer
        MODEL_INSTANCE.tts_to_file(
            text_to_synthesize,
            MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
            wav_buffer,
            speed=SPEED,
            format='wav'
        )

        # Reset buffer position to the beginning
        wav_buffer.seek(0)

        # Encode the bytes to base64
        wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
        logger.info("Synthesis complete.")

        # Return the data URI string our React app expects
        return f"data:audio/wav;base64,{wav_base64}"

    except Exception as e:
        logger.exception(f"TTS synthesis error: {e}")
        raise gr.Error(f"An error occurred during synthesis: {str(e)}")


# --- Create and Launch the Gradio Interface ---
# We create a pure API with no complex UI. This is fast and reliable.
iface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Text to Synthesize"),
    outputs="text",  # The API will return a simple text string (our base64 URI)
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
    api_name="synthesize"
)

# The .queue() helps manage traffic and is recommended for public APIs.
iface.queue().launch()
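
# --- Example client usage (sketch, not part of the running app) ---
# A minimal sketch of how a client could call this API and decode the returned
# data URI, using the official gradio_client package. The Space ID
# "your-username/melotts-api" is hypothetical; replace it with your actual
# Space. Save this as a separate script (e.g. client_example.py) rather than
# appending it to app.py.
#
#     from gradio_client import Client
#     import base64
#
#     client = Client("your-username/melotts-api")  # hypothetical Space ID
#     data_uri = client.predict("안녕하세요, 테스트입니다.", api_name="/synthesize")
#
#     # Strip the "data:audio/wav;base64," prefix and decode the WAV bytes
#     wav_bytes = base64.b64decode(data_uri.split(",", 1)[1])
#     with open("output.wav", "wb") as f:
#         f.write(wav_bytes)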