# app.py (for MeloTTS API Space)
import gradio as gr
import os
import torch
import io
import numpy as np
import soundfile as sf
import base64
import logging

# --- Setup ---
# This command is crucial and needs to run once.
# It downloads the unidic dictionary needed for Japanese/Korean text processing.
os.system('python -m unidic download')

from melo.api import TTS
# --- Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# --- Model Loading ---
MODEL = None
SPEAKER_ID = None
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LANGUAGE = 'KR'
# A speed of 0.9 slows the delivery slightly from the default 1.0 for a clear pace.
SPEED = 0.9

try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}")
    MODEL = TTS(language=LANGUAGE, device=DEVICE)
    # MeloTTS expects a numeric speaker id; for Korean the only speaker is 'KR'.
    SPEAKER_ID = MODEL.hps.data.spk2id['KR']
    logger.info("MeloTTS model loaded successfully.")
    logger.info(f"Default speaker: {SPEAKER_ID}, Default speed: {SPEED}")
except Exception as e:
    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
    MODEL = None
# --- Main TTS Synthesis Function ---
def synthesize(text_to_synthesize):
    if MODEL is None:
        raise gr.Error("TTS Model is not loaded. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Return a short silent clip as a data URI so the caller always receives valid audio.
        sample_rate = MODEL.hps.data.sampling_rate
        silent_audio = np.zeros(int(0.1 * sample_rate), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, sample_rate, format='WAV')
        wav_buffer.seek(0)
        wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
        return f"data:audio/wav;base64,{wav_base64}"

    try:
        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
        # Write the synthesized audio into an in-memory buffer instead of a file on disk.
        wav_buffer = io.BytesIO()
        MODEL.tts_to_file(text_to_synthesize, SPEAKER_ID, wav_buffer, speed=SPEED, format='wav')
        # Rewind the buffer, then base64-encode the WAV bytes for transport as text.
        wav_buffer.seek(0)
        wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
        logger.info("Synthesis complete.")
        return f"data:audio/wav;base64,{wav_base64}"
    except Exception as e:
        logger.exception(f"MeloTTS synthesis error: {e}")
        raise gr.Error(f"An error occurred during synthesis: {str(e)}")
# --- Create and Launch the Gradio Interface ---
# We create a pure API endpoint without a complex UI.
iface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Text to Synthesize"),
    outputs=gr.Textbox(label="Base64 Audio Output"),  # The API returns the base64 data URI as a text string
    title="MeloTTS API for Korean",
    description="A simplified API for MeloTTS, configured for the Korean language.",
    api_name="synthesize",
)

# .queue() is important for handling multiple concurrent requests on HF Spaces.
iface.queue().launch()
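
To call this Space from another program, a minimal client sketch using the gradio_client package might look like the following. The Space name YOUR_USERNAME/melotts-korean-api is a placeholder, not the real deployment; the decoding step simply strips the data-URI prefix and base64-decodes the WAV bytes returned by the API above.

# client.py (example caller for the MeloTTS API Space; Space name is a placeholder)
import base64

from gradio_client import Client

# Point the client at the deployed Space (replace with the actual owner/name).
client = Client("YOUR_USERNAME/melotts-korean-api")

# Call the endpoint exposed by api_name="synthesize" in app.py.
data_uri = client.predict("안녕하세요, 만나서 반갑습니다.", api_name="/synthesize")

# The Space returns "data:audio/wav;base64,<payload>"; decode it back into a WAV file.
wav_bytes = base64.b64decode(data_uri.split(",", 1)[1])
with open("output.wav", "wb") as f:
    f.write(wav_bytes)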