Spaces:

minhhungg
/

melotts-api

Running

App Files Files Community

minhhungg committed 21 days ago

Commit

d52b07f

verified ·

1 Parent(s): 022bfb3

Create app.py

Browse files

Files changed (1) hide show

app.py +86 -0

app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+# app.py (for MeloTTS API Space)
+import gradio as gr
+import os
+import torch
+import io
+import soundfile as sf
+import base64
+import logging
+# --- Logging ---
+# Configured first so that the setup step below can report a failure.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+# --- Setup ---
+# This command is crucial and needs to run once.
+# It downloads the dictionary needed for Japanese/Korean.
+# os.system() only signals failure through its return value, so check it
+# rather than continuing silently without the dictionary.
+_unidic_status = os.system('python -m unidic download')
+if _unidic_status != 0:
+    logger.warning(f"'python -m unidic download' exited with status {_unidic_status}")
+# Imported after the download step, matching the original ordering.
+from melo.api import TTS
+# --- Model Loading ---
+MODEL = None
+# Bound up front so the name exists even when model loading fails below.
+SPEAKER_ID = None
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+LANGUAGE = 'KR'
+# MeloTTS's default speed is 1.0, so 0.9 is slightly slower than normal,
+# which gives a clearer pace.
+SPEED = 0.9
+try:
+    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}")
+    MODEL = TTS(language=LANGUAGE, device=DEVICE)
+    # The MeloTTS usage docs pass the numeric speaker id looked up in the
+    # model's speaker table, not the speaker name itself. For Korean the
+    # only speaker is 'KR', which is the same string as LANGUAGE.
+    SPEAKER_ID = MODEL.hps.data.spk2id[LANGUAGE]
+    logger.info("MeloTTS model loaded successfully.")
+    logger.info(f"Default speaker: {LANGUAGE} (id {SPEAKER_ID}), Default speed: {SPEED}")
+except Exception as e:
+    # Keep the app importable; synthesize() checks MODEL and reports the
+    # failure to the caller instead of crashing at startup.
+    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
+    MODEL = None
+# --- Main TTS Synthesis Function ---
+def _wav_buffer_to_data_uri(wav_buffer):
+    """Return the contents of an in-memory WAV buffer as a base64 data URI."""
+    # Rewind first: the writer leaves the position at the end of the data.
+    wav_buffer.seek(0)
+    wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
+    return f"data:audio/wav;base64,{wav_base64}"
+def synthesize(text_to_synthesize):
+    """Synthesize speech for the given text and return it as a WAV data URI.
+
+    Args:
+        text_to_synthesize: The text to speak. None, empty or
+            whitespace-only input yields a short silent clip instead of
+            calling the model.
+
+    Returns:
+        A string of the form "data:audio/wav;base64,<payload>".
+
+    Raises:
+        gr.Error: If the model failed to load at startup, or if synthesis
+            itself raises.
+    """
+    if not MODEL:
+        raise gr.Error("TTS Model is not loaded. Cannot process request.")
+    if not text_to_synthesize or not text_to_synthesize.strip():
+        # numpy is not imported at module level in this file, so import it
+        # here; otherwise this branch fails with NameError on `np`.
+        import numpy as np
+        # Create and return a silent audio data URI (0.1 s of int16 zeros).
+        # NOTE(review): 24000 Hz is assumed here - confirm it matches the
+        # sampling rate of the loaded model.
+        silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
+        wav_buffer = io.BytesIO()
+        sf.write(wav_buffer, silent_audio, 24000, format='WAV')
+        return _wav_buffer_to_data_uri(wav_buffer)
+    try:
+        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
+        # Use an in-memory BytesIO object to store the audio
+        wav_buffer = io.BytesIO()
+        MODEL.tts_to_file(text_to_synthesize, SPEAKER_ID, wav_buffer, speed=SPEED, format='wav')
+        data_uri = _wav_buffer_to_data_uri(wav_buffer)
+        logger.info("Synthesis complete.")
+        return data_uri
+    except Exception as e:
+        logger.exception(f"MeloTTS synthesis error: {e}")
+        # Chain the original exception so the traceback keeps its cause.
+        raise gr.Error(f"An error occurred during synthesis: {str(e)}") from e
+# --- Create and Launch the Gradio Interface ---
+# A deliberately minimal UI: one text input and one text output, so the Space
+# mainly serves as an API endpoint named "synthesize".
+iface = gr.Interface(
+    title="MeloTTS API for Korean",
+    description="A simplified API for MeloTTS, configured for Korean language.",
+    api_name="synthesize",
+    fn=synthesize,
+    inputs=gr.Textbox(label="Text to Synthesize"),
+    # The result is a base64 data URI, so a plain text output is used.
+    outputs=gr.Textbox(label="Base64 Audio Output"),
+)
+# Enable the request queue before launching; it is important for handling
+# multiple requests on HF Spaces.
+iface.queue()
+iface.launch()