Spaces:

minhhungg
/

melotts-api

Running

File size: 3,350 Bytes

# app.py (for your new MeloTTS space)

import base64
import io
import logging
import os
import subprocess
import sys

import gradio as gr
import numpy as np
import soundfile as sf
import torch

# Download the UniDic data before MeloTTS is imported further down.
# The download is launched with the interpreter that is running this app
# (instead of whatever `python` happens to be first on PATH) and without a
# shell, so the data is installed for the environment that will import it.
_unidic_download = subprocess.run(
    [sys.executable or 'python', '-m', 'unidic', 'download'],
    check=False,
)
if _unidic_download.returncode != 0:
    # Logging has not been configured yet at this point in the file, so the
    # failure is reported on stderr directly. Start-up still continues, which
    # keeps the previous best-effort behaviour.
    print(
        f"WARNING: 'unidic download' exited with status {_unidic_download.returncode}",
        file=sys.stderr,
    )

from melo.api import TTS

# --- Setup Logging ---
# Configure the root logger once: INFO and above, with each record rendered as
# "<timestamp> - <level> - <message>". The module-level logger is used below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Configuration ---
# Every synthesis setting is fixed here; callers only supply the text.
LANGUAGE = "KR"
# NOTE: 0.1 would be extremely slow. 0.8 is a reasonable starting point; tune if needed.
SPEED = 0.8
SPEAKER_ID = "KR"  # Default Korean speaker
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Load Model (runs a single time, when the space starts) ---
# MODEL_INSTANCE stays None when loading fails; the failure is logged with its
# traceback instead of aborting start-up.
MODEL_INSTANCE = None
try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}...")
    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
    logger.info("MeloTTS model loaded successfully.")
except Exception as load_error:
    MODEL_INSTANCE = None
    logger.exception(f"FATAL: MeloTTS model initialization error: {load_error}")

def _wav_bytes_to_data_uri(wav_bytes):
    """Wrap raw WAV bytes in a ``data:audio/wav;base64,...`` URI string."""
    encoded = base64.b64encode(wav_bytes).decode('utf-8')
    return f"data:audio/wav;base64,{encoded}"


def synthesize(text_to_synthesize):
    """
    Synthesize speech for the given text and return it as a WAV data URI.

    Args:
        text_to_synthesize: The text to speak. ``None``, an empty string, or
            a whitespace-only string yields a short silent clip instead of a
            model call.

    Returns:
        A ``data:audio/wav;base64,...`` string containing the WAV audio.

    Raises:
        gr.Error: If the model failed to load at start-up, or if synthesis
            raises; in the latter case the original exception is chained.
    """
    # Compare against None explicitly rather than relying on the truthiness
    # of the model object.
    if MODEL_INSTANCE is None:
        raise gr.Error("TTS Model is not available. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Nothing to speak: return 0.1 s of silence (int16 samples at 24 kHz).
        silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, 24000, format='WAV')
        return _wav_bytes_to_data_uri(wav_buffer.getvalue())

    try:
        # Only the first 80 characters are logged to keep log lines short.
        logger.info("Synthesizing for text: '%s...'", text_to_synthesize[:80])

        # The audio is written to an in-memory buffer, not to a file on disk.
        wav_buffer = io.BytesIO()
        MODEL_INSTANCE.tts_to_file(
            text_to_synthesize,
            MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
            wav_buffer,
            speed=SPEED,
            format='wav',
        )

        # getvalue() returns the whole buffer regardless of the current
        # position, so no seek(0) is needed before encoding.
        data_uri = _wav_bytes_to_data_uri(wav_buffer.getvalue())

        logger.info("Synthesis complete.")

        # The data URI string is what the React client expects.
        return data_uri

    except Exception as e:
        logger.exception("TTS synthesis error: %s", e)
        # Chain the original exception so the root cause is kept on the
        # raised gr.Error.
        raise gr.Error(f"An error occurred during synthesis: {str(e)}") from e

# --- Create and Launch the Gradio Interface ---
# API-only app: a single text input and a single text output, no complex UI.
text_input = gr.Textbox(label="Text to Synthesize")
iface = gr.Interface(
    synthesize,
    text_input,
    "text",  # The API returns a plain string (the base64 data URI)
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
    api_name="synthesize",
)

# Enable request queuing before launching; recommended for public APIs.
queued_iface = iface.queue()
queued_iface.launch()