minhhungg committed on
Commit
d52b07f
·
verified ·
1 Parent(s): 022bfb3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -0
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py (for MeloTTS API Space)
2
+
3
+ import gradio as gr
4
+ import os
5
+ import torch
6
+ import io
7
+ import soundfile as sf
8
+ import base64
9
+ import logging
10
+
11
+ # --- Setup ---
12
# The dictionary needed for Japanese/Korean must be downloaded before melo is
# imported below, so this step runs first.
import subprocess
import sys

# Run the download with the interpreter executing this script (sys.executable):
# a bare `python` found on PATH may belong to a different environment, in which
# case the dictionary would not land where this process looks for it.
_unidic_download = subprocess.run(
    [sys.executable, '-m', 'unidic', 'download'],
    check=False,  # best effort: a failed download is reported below, not fatal here
)

from melo.api import TTS

# --- Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Reported only now because the logger did not exist when the download ran.
if _unidic_download.returncode != 0:
    logger.warning(
        "unidic download exited with status %s; model loading may fail.",
        _unidic_download.returncode,
    )
21
+
22
+ # --- Model Loading ---
23
# Module-level model state. MODEL stays None when loading fails, which
# synthesize() checks before doing any work.
MODEL = None
# Defined up front so the name exists even if model loading fails.
SPEAKER_ID = None
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LANGUAGE = 'KR'
# Speed multiplier handed to MeloTTS on every request.
# NOTE(review): in MeloTTS, values below 1.0 are expected to slow speech down
# rather than speed it up - confirm 0.9 gives the intended pace.
SPEED = 0.9

try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}")
    MODEL = TTS(language=LANGUAGE, device=DEVICE)
    # tts_to_file expects the numeric speaker id, not the speaker's name, so
    # resolve the name through the model's speaker table (per MeloTTS usage docs).
    SPEAKER_ID = MODEL.hps.data.spk2id[LANGUAGE]
    logger.info("MeloTTS model loaded successfully.")
    logger.info(f"Default speaker: {LANGUAGE} (id={SPEAKER_ID}), Default speed: {SPEED}")
except Exception as e:
    # Top-level boundary: log the full traceback and leave the app running so
    # requests fail with a clear error instead of the process crashing.
    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
    MODEL = None
38
+
39
+ # --- Main TTS Synthesis Function ---
40
def _wav_buffer_to_data_uri(wav_buffer):
    """Return the buffer's full contents as a base64 ``data:audio/wav`` URI."""
    encoded = base64.b64encode(wav_buffer.getvalue()).decode('utf-8')
    return f"data:audio/wav;base64,{encoded}"


def synthesize(text_to_synthesize):
    """Synthesize speech for the given text and return it as a WAV data URI.

    Args:
        text_to_synthesize: Text to speak. Empty or whitespace-only input
            yields a short silent clip instead of calling the model.

    Returns:
        A ``data:audio/wav;base64,...`` string containing the WAV audio.

    Raises:
        gr.Error: If the model is not loaded or synthesis fails.
    """
    if MODEL is None:
        raise gr.Error("TTS Model is not loaded. Cannot process request.")

    if not text_to_synthesize or not text_to_synthesize.strip():
        # numpy is imported here because the file's import block never brings
        # `np` into scope; without this the branch raised NameError.
        import numpy as np

        # 0.1 s of silence.
        # NOTE(review): 24 kHz is assumed here - confirm it matches the
        # model's actual output sampling rate.
        silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
        wav_buffer = io.BytesIO()
        sf.write(wav_buffer, silent_audio, 24000, format='WAV')
        return _wav_buffer_to_data_uri(wav_buffer)

    try:
        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")

        # Write the audio into memory rather than to a file on disk.
        wav_buffer = io.BytesIO()
        MODEL.tts_to_file(text_to_synthesize, SPEAKER_ID, wav_buffer, speed=SPEED, format='wav')

        data_uri = _wav_buffer_to_data_uri(wav_buffer)
        logger.info("Synthesis complete.")
        return data_uri

    except Exception as e:
        # Log the traceback server-side; surface a readable message to the caller.
        logger.exception(f"MeloTTS synthesis error: {e}")
        raise gr.Error(f"An error occurred during synthesis: {str(e)}") from e
72
+
73
+
74
+ # --- Create and Launch the Gradio Interface ---
75
+ # We create a pure API without a complex UI.
76
# Components are built first so the Interface call below reads as plain wiring.
text_input = gr.Textbox(label="Text to Synthesize")
# The audio is returned as a text string (data URI) for API consumers.
base64_output = gr.Textbox(label="Base64 Audio Output")

iface = gr.Interface(
    fn=synthesize,
    inputs=text_input,
    outputs=base64_output,
    title="MeloTTS API for Korean",
    description="A simplified API for MeloTTS, configured for Korean language.",
    api_name="synthesize",
)

# Enable the request queue before launching; the original author notes this is
# important for handling multiple requests on HF Spaces.
iface.queue()
iface.launch()