# app.py (for MeloTTS API Space)
import gradio as gr
import os
import torch
import io
import numpy as np
import soundfile as sf
import base64
import logging

# --- Setup ---
# This command is crucial and needs to run once.
# It downloads the unidic dictionary needed for Japanese/Korean text processing.
os.system('python -m unidic download')

from melo.api import TTS
# --- Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# --- Model Loading ---
MODEL = None
SPEAKER_ID = None
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LANGUAGE = 'KR'
# A speed of 0.9 slows the delivery slightly from the default 1.0 for a clear pace.
SPEED = 0.9

try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}")
    MODEL = TTS(language=LANGUAGE, device=DEVICE)
    # MeloTTS expects a numeric speaker id; for Korean the only speaker is 'KR'.
    SPEAKER_ID = MODEL.hps.data.spk2id['KR']
    logger.info("MeloTTS model loaded successfully.")
    logger.info(f"Default speaker: {SPEAKER_ID}, Default speed: {SPEED}")
except Exception as e:
    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
    MODEL = None
# --- Main TTS Synthesis Function ---
def synthesize(text_to_synthesize):
    if MODEL is None:
        raise gr.Error("TTS Model is not loaded. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Return a short silent clip as a data URI so the caller always receives valid audio.
        sample_rate = MODEL.hps.data.sampling_rate
        silent_audio = np.zeros(int(0.1 * sample_rate), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, sample_rate, format='WAV')
        wav_buffer.seek(0)
        wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
        return f"data:audio/wav;base64,{wav_base64}"

    try:
        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
        # Write the synthesized audio into an in-memory buffer instead of a file on disk.
        wav_buffer = io.BytesIO()
        MODEL.tts_to_file(text_to_synthesize, SPEAKER_ID, wav_buffer, speed=SPEED, format='wav')
        # Rewind the buffer, then base64-encode the WAV bytes for transport as text.
        wav_buffer.seek(0)
        wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
        logger.info("Synthesis complete.")
        return f"data:audio/wav;base64,{wav_base64}"
    except Exception as e:
        logger.exception(f"MeloTTS synthesis error: {e}")
        raise gr.Error(f"An error occurred during synthesis: {str(e)}")
# --- Create and Launch the Gradio Interface ---
# We create a pure API endpoint without a complex UI.
iface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Text to Synthesize"),
    outputs=gr.Textbox(label="Base64 Audio Output"),  # The API returns the base64 data URI as a text string
    title="MeloTTS API for Korean",
    description="A simplified API for MeloTTS, configured for the Korean language.",
    api_name="synthesize",
)

# .queue() is important for handling multiple concurrent requests on HF Spaces.
iface.queue().launch()
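
To call this Space from another program, a minimal client sketch using the gradio_client package might look like the following. The Space name YOUR_USERNAME/melotts-korean-api is a placeholder, not the real deployment; the decoding step simply strips the data-URI prefix and base64-decodes the WAV bytes returned by the API above.

# client.py (example caller for the MeloTTS API Space; Space name is a placeholder)
import base64

from gradio_client import Client

# Point the client at the deployed Space (replace with the actual owner/name).
client = Client("YOUR_USERNAME/melotts-korean-api")

# Call the endpoint exposed by api_name="synthesize" in app.py.
data_uri = client.predict("안녕하세요, 만나서 반갑습니다.", api_name="/synthesize")

# The Space returns "data:audio/wav;base64,<payload>"; decode it back into a WAV file.
wav_bytes = base64.b64decode(data_uri.split(",", 1)[1])
with open("output.wav", "wb") as f:
    f.write(wav_bytes)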