File size: 3,350 Bytes
146a956
d52b07f
 
 
 
146a956
 
d52b07f
 
 
 
146a956
d52b07f
 
 
 
146a956
d52b07f
 
 
146a956
 
d52b07f
146a956
 
 
 
d52b07f
146a956
 
d52b07f
146a956
 
d52b07f
 
 
146a956
d52b07f
 
146a956
 
 
 
 
d52b07f
 
146a956
 
d52b07f
 
 
 
 
 
 
 
 
146a956
d52b07f
146a956
 
 
 
 
 
 
 
 
d52b07f
 
 
 
146a956
d52b07f
 
 
146a956
 
d52b07f
 
 
146a956
d52b07f
 
 
146a956
d52b07f
 
 
146a956
 
 
d52b07f
 
 
146a956
d52b07f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# app.py (for your new MeloTTS space)

import base64
import io
import logging
import os
import subprocess
import sys

import gradio as gr
import numpy as np
import soundfile as sf
import torch

# Fetch the unidic dictionary data up front, before `melo` is imported below.
# The download is run through the interpreter that is executing this file
# (sys.executable) and with an argument list rather than a shell string, so
# it targets this environment even if `python` on PATH points elsewhere.
_unidic_download = subprocess.run(
    [sys.executable or 'python', '-m', 'unidic', 'download'],
    check=False,  # best effort: a failed download must not stop start-up
)
if _unidic_download.returncode != 0:
    # Written straight to stderr because logging is not configured yet here.
    print(
        f"WARNING: 'unidic download' exited with code {_unidic_download.returncode}",
        file=sys.stderr,
    )

from melo.api import TTS

# --- Logging ---
# Root logger at INFO with timestamped records; this module logs via `logger`.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Fixed synthesis settings ---
# Every request is served with these values; edit them here to retune.
LANGUAGE = 'KR'
SPEAKER_ID = 'KR'  # Default Korean speaker
# NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point.
SPEED = 0.8
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# --- One-time model load at start-up ---
# MODEL_INSTANCE stays None when construction fails, so the app can still
# start and report the problem per request instead of crashing on import.
MODEL_INSTANCE = None
try:
    logger.info(
        "Loading MeloTTS model for language: %s on device: %s...",
        LANGUAGE,
        DEVICE,
    )
    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
except Exception as load_error:
    # logger.exception also records the traceback of the failure.
    logger.exception("FATAL: MeloTTS model initialization error: %s", load_error)
else:
    logger.info("MeloTTS model loaded successfully.")

def _wav_buffer_to_data_uri(wav_buffer):
    """Encode the full contents of an in-memory WAV buffer as a data URI."""
    # getvalue() returns everything written, whatever the current position is.
    wav_base64 = base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
    return f"data:audio/wav;base64,{wav_base64}"


def synthesize(text_to_synthesize):
    """
    Takes text input and returns a base64 encoded WAV audio data URI string.

    Args:
        text_to_synthesize: The text to speak. ``None``, an empty string or a
            whitespace-only string yields a short silent clip instead of a
            model call.

    Returns:
        A string of the form ``data:audio/wav;base64,<payload>``.

    Raises:
        gr.Error: If the model failed to load at start-up, or if synthesis
            raises; in the latter case the original exception is chained as
            the cause.
    """
    # Explicit None check: the "model missing" state is exactly None, and the
    # test should not depend on how the model object evaluates as a boolean.
    if MODEL_INSTANCE is None:
        raise gr.Error("TTS Model is not available. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Nothing to speak: return 0.1 s of 16-bit silence at 24 kHz.
        silence_sample_rate = 24000
        silence_seconds = 0.1
        silent_audio = np.zeros(int(silence_seconds * silence_sample_rate), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, silence_sample_rate, format='WAV')
        return _wav_buffer_to_data_uri(wav_buffer)

    try:
        # Only the first 80 characters are logged to keep log lines short.
        logger.info("Synthesizing for text: '%s...'", text_to_synthesize[:80])

        # Render into memory rather than a temporary file on disk.
        wav_buffer = io.BytesIO()
        MODEL_INSTANCE.tts_to_file(
            text_to_synthesize,
            MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
            wav_buffer,
            speed=SPEED,
            format='wav',
        )

        data_uri = _wav_buffer_to_data_uri(wav_buffer)
        logger.info("Synthesis complete.")

        # Return the data URI string our React app expects
        return data_uri

    except Exception as e:
        logger.exception("TTS synthesis error: %s", e)
        # Chain the cause so the original traceback stays attached.
        raise gr.Error(f"An error occurred during synthesis: {str(e)}") from e

# --- Gradio interface: one text box in, one text value out ---
# The UI is deliberately minimal; the output is declared as plain text
# because synthesize() returns the audio as a base64 data URI string.
text_input = gr.Textbox(label="Text to Synthesize")

iface = gr.Interface(
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
    fn=synthesize,
    inputs=text_input,
    outputs="text",
    api_name="synthesize",
)

# Enable the request queue before serving; it helps manage traffic and is
# recommended for public APIs.
iface.queue().launch()