Spaces:
Running
Running
File size: 3,350 Bytes
146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f 146a956 d52b07f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# app.py (for your new MeloTTS space)
import gradio as gr
import torch
import io
import os
import numpy as np
import soundfile as sf
import base64
import logging
# Download the unidic dictionary data at startup, before `melo` is imported
# below. The original note marks this step as important and to be run first
# (presumably the melo import relies on the data being present - confirm).
import subprocess
import sys

# Use the interpreter that is running this app rather than whatever `python`
# happens to be first on PATH, and pass an argument list so no shell is involved.
_unidic_download = subprocess.run(
    [sys.executable or 'python', '-m', 'unidic', 'download'],
    check=False,
)
if _unidic_download.returncode != 0:
    # Startup stays best-effort, as before, but the failure is no longer silent.
    # Written to stderr directly because logging is not configured yet here.
    print(
        "WARNING: 'unidic download' exited with code "
        f"{_unidic_download.returncode}",
        file=sys.stderr,
    )
from melo.api import TTS
# --- Setup Logging ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# --- Configuration ---
# Every synthesis setting is fixed here; nothing is chosen per request.
LANGUAGE = 'KR'
# NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point. Adjust if needed.
SPEED = 0.8
# Prefer the GPU when torch reports one, otherwise run on the CPU.
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'
SPEAKER_ID = 'KR'  # Default Korean speaker


# --- Load Model (this happens only once when the space starts) ---
def _load_model():
    """Construct the MeloTTS model, or return None when construction fails.

    Any exception raised while loading is logged with its traceback and
    swallowed, so the module still finishes importing; callers must treat a
    None result as "model unavailable".
    """
    try:
        logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}...")
        model = TTS(language=LANGUAGE, device=DEVICE)
        logger.info("MeloTTS model loaded successfully.")
    except Exception as e:
        logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
        return None
    return model


MODEL_INSTANCE = _load_model()
# Shape of the short silent clip returned for empty input.
_SILENCE_SAMPLE_RATE = 24000
_SILENCE_SECONDS = 0.1


def _wav_buffer_to_data_uri(wav_buffer):
    """Return the entire contents of *wav_buffer* as a base64 WAV data URI."""
    # getvalue() reads the whole buffer regardless of the current position,
    # so no seek(0) is needed after writing.
    encoded = base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
    return f"data:audio/wav;base64,{encoded}"


def synthesize(text_to_synthesize):
    """
    Takes text input and returns a base64 encoded WAV audio data URI string.

    Args:
        text_to_synthesize: Text to speak. None, empty, or whitespace-only
            input yields a 0.1 second silent clip instead of calling the model.

    Returns:
        A string of the form "data:audio/wav;base64,<payload>".

    Raises:
        gr.Error: If the model failed to load at startup, or if synthesis
            raises; the original exception is chained as the cause.
    """
    # Explicit None check: the model object's truthiness is not part of its
    # contract, only "loaded" vs "not loaded" matters here.
    if MODEL_INSTANCE is None:
        raise gr.Error("TTS Model is not available. Cannot process request.")

    # In-memory buffer that receives the WAV bytes in either branch below.
    wav_buffer = io.BytesIO()

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Nothing to say: return a short silent clip rather than an error.
        silent_audio = np.zeros(
            int(_SILENCE_SECONDS * _SILENCE_SAMPLE_RATE), dtype=np.int16
        )
        sf.write(wav_buffer, silent_audio, _SILENCE_SAMPLE_RATE, format='WAV')
        return _wav_buffer_to_data_uri(wav_buffer)

    try:
        # Only the first 80 characters are logged to keep log lines bounded.
        logger.info("Synthesizing for text: '%s...'", text_to_synthesize[:80])
        # Synthesize straight into the buffer using the pre-configured
        # speaker and speed.
        MODEL_INSTANCE.tts_to_file(
            text_to_synthesize,
            MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
            wav_buffer,
            speed=SPEED,
            format='wav',
        )
        data_uri = _wav_buffer_to_data_uri(wav_buffer)
        logger.info("Synthesis complete.")
        # The data URI string is the whole API response.
        return data_uri
    except Exception as e:
        logger.exception("TTS synthesis error: %s", e)
        # Chain the cause so the original traceback is preserved.
        raise gr.Error(f"An error occurred during synthesis: {str(e)}") from e
# --- Create and Launch the Gradio Interface ---
# One text-in / text-out endpoint with no custom UI; the app is meant to be
# called as an API.
_text_input = gr.Textbox(label="Text to Synthesize")

iface = gr.Interface(
    fn=synthesize,
    inputs=_text_input,
    # Plain text output: the value returned is the base64 data URI string.
    outputs="text",
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
    api_name="synthesize",
)

# Enable request queuing first (recommended for public APIs to manage
# traffic), then start serving on the object queue() hands back.
_queued_app = iface.queue()
_queued_app.launch()