Spaces:
Running
Running
File size: 3,019 Bytes
d52b07f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
# app.py (for MeloTTS API Space)
import base64
import io
import logging
import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch
# --- Setup ---
# One-time download of the unidic dictionary that MeloTTS needs for
# Japanese/Korean tokenization. This must run BEFORE `from melo.api import TTS`,
# which is why the import sits below the os.system call.
os.system('python -m unidic download')
from melo.api import TTS
# --- Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# --- Model Loading ---
MODEL = None  # TTS instance once loaded; stays None if initialization fails
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer GPU when present
LANGUAGE = 'KR'  # Korean
# 0.9 gives a slightly faster-than-normal but still clear speaking pace.
SPEED = 0.9
try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}")
    MODEL = TTS(language=LANGUAGE, device=DEVICE)
    SPEAKER_ID = 'KR'  # For Korean, the main speaker is just 'KR'
    logger.info("MeloTTS model loaded successfully.")
    logger.info(f"Default speaker: {SPEAKER_ID}, Default speed: {SPEED}")
except Exception as e:
    # Log and keep going so the Gradio app still starts; synthesize() checks
    # for MODEL is None and raises a user-facing error instead.
    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
    MODEL = None
# --- Main TTS Synthesis Function ---
def synthesize(text_to_synthesize):
    """Synthesize Korean speech for *text_to_synthesize*.

    Returns:
        str: a ``data:audio/wav;base64,...`` data URI containing WAV audio.
        Empty or whitespace-only input yields 0.1 s of silence instead of an
        error, so API clients always receive playable audio.

    Raises:
        gr.Error: if the model failed to load at startup, or synthesis fails.
    """
    # Fix: explicit identity check — truthiness of a loaded model object is
    # not guaranteed; only None means "failed to initialize".
    if MODEL is None:
        raise gr.Error("TTS Model is not loaded. Cannot process request.")
    if not text_to_synthesize or not text_to_synthesize.strip():
        # 0.1 s of int16 silence at Melo's 24 kHz output rate.
        # (Requires `import numpy as np` at file top — missing in the original.)
        silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, 24000, format='WAV')
        return encode_wav_data_uri(wav_buffer)
    try:
        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
        # Render straight into an in-memory buffer; no temp files on disk.
        wav_buffer = io.BytesIO()
        MODEL.tts_to_file(text_to_synthesize, SPEAKER_ID, wav_buffer, speed=SPEED, format='wav')
        logger.info("Synthesis complete.")
        return encode_wav_data_uri(wav_buffer)
    except Exception as e:
        logger.exception(f"MeloTTS synthesis error: {e}")
        raise gr.Error(f"An error occurred during synthesis: {str(e)}")


def encode_wav_data_uri(wav_buffer):
    """Rewind *wav_buffer* and return its contents as a WAV base64 data URI."""
    wav_buffer.seek(0)
    wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
    return f"data:audio/wav;base64,{wav_base64}"
# --- Create and Launch the Gradio Interface ---
# Minimal UI: the Space is meant to be consumed through the /synthesize API,
# not browsed interactively.
iface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Text to Synthesize"),
    # The audio comes back as a base64 data-URI string, so the output
    # component is a plain Textbox rather than gr.Audio.
    outputs=gr.Textbox(label="Base64 Audio Output"),
    title="MeloTTS API for Korean",
    description="A simplified API for MeloTTS, configured for Korean language.",
    api_name="synthesize",
)
# queue() is important for handling multiple concurrent requests on HF Spaces.
# (Stray trailing '|' after launch() removed — it would be a syntax error.)
iface.queue().launch()