Create app.py
app.py
ADDED
@@ -0,0 +1,141 @@
import gradio as gr
import torch
import numpy as np
import os
import io
import base64
from kokoro import KModel, KPipeline

# Check if CUDA is available
CUDA_AVAILABLE = torch.cuda.is_available()

# Initialize the model
model = KModel().to('cuda' if CUDA_AVAILABLE else 'cpu').eval()

# Initialize pipelines for different language codes (using 'a' for English)
pipelines = {'a': KPipeline(lang_code='a', model=False)}

# Custom pronunciation for "kokoro"
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'

def text_to_audio(text, speed=1.0):
    """Convert text to audio using the Kokoro model.

    Args:
        text: The text to convert to speech
        speed: Speech speed multiplier (0.5-2.0, where 1.0 is normal speed)

    Returns:
        Audio data as a tuple of (sample_rate, audio_array)
    """
    if not text:
        return None

    pipeline = pipelines['a']  # Use English pipeline
    voice = "af_heart"  # Default voice (US English, female, Heart)

    # Process the text (only the first segment yielded by the pipeline is synthesized)
    pack = pipeline.load_voice(voice)

    for _, ps, _ in pipeline(text, voice, speed):
        ref_s = pack[len(ps)-1]

        # Generate audio
        try:
            audio = model(ps, ref_s, speed)
        except Exception as e:
            raise gr.Error(f"Error generating audio: {str(e)}")

        # Return the audio with a 24 kHz sample rate
        return 24000, audio.numpy()

    return None

def text_to_audio_b64(text, speed=1.0):
    """Convert text to audio and return as a base64-encoded WAV file.

    Args:
        text: The text to convert to speech
        speed: Speech speed multiplier (0.5-2.0, where 1.0 is normal speed)

    Returns:
        Base64-encoded WAV file as a string
    """
    import soundfile as sf

    result = text_to_audio(text, speed)
    if result is None:
        return None

    sample_rate, audio_data = result

    # Save to a BytesIO object
    wav_io = io.BytesIO()
    sf.write(wav_io, audio_data, sample_rate, format='WAV')
    wav_io.seek(0)

    # Convert to base64
    wav_b64 = base64.b64encode(wav_io.read()).decode('utf-8')
    return wav_b64

# Create Gradio interface
with gr.Blocks(title="Kokoro Text-to-Audio MCP") as app:
    gr.Markdown("# 🎵 Kokoro Text-to-Audio MCP")
    gr.Markdown("Convert text to speech using the Kokoro-82M model")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter your text",
                placeholder="Type something to convert to audio...",
                lines=5
            )
            speed_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speech Speed"
            )
            submit_btn = gr.Button("Generate Audio")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    submit_btn.click(
        fn=text_to_audio,
        inputs=[text_input, speed_slider],
        outputs=[audio_output]
    )

    gr.Markdown("### Usage Tips")
    gr.Markdown("- Adjust the speed slider to modify the pace of speech")

    # Add a section about MCP support
    with gr.Accordion("MCP Support (for LLMs)", open=False):
        gr.Markdown("""
### MCP Support

This app supports the Model Context Protocol (MCP), allowing LLM clients such as Claude Desktop to use it as a tool.

To use this app with an MCP client, add the following configuration:

```json
{
  "mcpServers": {
    "kokoroTTS": {
      "url": "https://fdaudens-kokoro-mcp.hf.space/gradio_api/mcp/sse"
    }
  }
}
```

Replace the URL above with your own Space URL if you duplicate this app.
""")

# Launch the app with MCP support
if __name__ == "__main__":
    # The GRADIO_MCP_SERVER environment variable can turn MCP off (enabled by default)
    enable_mcp = os.environ.get('GRADIO_MCP_SERVER', 'True').lower() in ('true', '1', 't')

    app.launch(mcp_server=enable_mcp)
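For a quick local sanity check of the two functions before deploying the Space, something like the sketch below can be used. It is a minimal example under a few assumptions: `app.py` sits in the working directory, the `kokoro` and `soundfile` dependencies are installed, and the model weights can be fetched on first import; the file name `local_test.py` and the sample sentences are illustrative only, not part of the Space.

```python
# local_test.py - minimal smoke test for app.py (hypothetical helper, not part of the Space).
# Importing app builds the Gradio Blocks and loads the Kokoro model, but does not
# start the server, because app.launch() is guarded by `if __name__ == "__main__"`.
import base64

from app import text_to_audio, text_to_audio_b64

# Direct call: returns (sample_rate, numpy_array), or None for empty text
result = text_to_audio("Hello from Kokoro!", speed=1.0)
print("sample rate:", result[0] if result else None)

# Base64 variant: decode the returned string back into a playable WAV file
wav_b64 = text_to_audio_b64("Hello from Kokoro!", speed=1.2)
if wav_b64 is not None:
    with open("output.wav", "wb") as f:
        f.write(base64.b64decode(wav_b64))
    print("Wrote output.wav")
```

The base64 round trip at the end mirrors how any caller that receives the string output of `text_to_audio_b64` could turn it back into audio on disk.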