Upload 2 files

- app.py +33 -45
- requirements.txt +0 -1
app.py
CHANGED
@@ -2,7 +2,6 @@ from huggingface_hub import snapshot_download
 from katsu import Katsu
 from models import build_model
 import gradio as gr
-import noisereduce as nr
 import numpy as np
 import os
 import phonemizer
@@ -112,33 +111,15 @@ def tokenize(ps):
 # ⭐ Starred voices are averages of similar voices. 🧪 Experimental voices may be unstable.
 CHOICES = {
     '🇺🇸 🚺 American Female ⭐': 'af',
-    '🇺🇸 🚺 American Female 1': 'af_1',
-    '🇺🇸 🚺 Alloy 🧪': 'af_alloy',
     '🇺🇸 🚺 Bella': 'af_bella',
-    '🇺🇸 🚺 Jessica 🧪': 'af_jessica',
-    '🇺🇸 🚺 Nicole': 'af_nicole',
-    '🇺🇸 🚺 Nova 🧪': 'af_nova',
-    '🇺🇸 🚺 River 🧪': 'af_river',
     '🇺🇸 🚺 Sarah': 'af_sarah',
     '🇺🇸 🚺 Sky 🧪': 'af_sky',
-    '🇺🇸 🚹 Adam': 'am_adam',
-    '🇺🇸 🚹 Echo 🧪': 'am_echo',
-    '🇺🇸 🚹 Eric 🧪': 'am_eric',
-    '🇺🇸 🚹 Liam 🧪': 'am_liam',
+    '🇺🇸 🚹 Adam 🧪': 'am_adam',
     '🇺🇸 🚹 Michael': 'am_michael',
-    '
-    '
-    '🇬🇧 🚺 Alice 🧪': 'bf_alice',
-    '🇬🇧 🚺 Lily 🧪': 'bf_lily',
-    '🇬🇧 🚹 British Male 0': 'bm_0',
-    '🇬🇧 🚹 British Male 1': 'bm_1',
-    '🇬🇧 🚹 British Male 2': 'bm_2',
-    '🇬🇧 🚹 Daniel 🧪': 'bm_daniel',
-    '🇬🇧 🚹 Fable 🧪': 'bm_fable',
-    '🇬🇧 🚹 George 🧪': 'bm_george',
-    '🇯🇵 🚺 Japanese Female 0': 'jf_0',
+    '🇬🇧 🚹 Lewis': 'bm_lewis',
+    '🇯🇵 🚺 Japanese Female 🧪': 'jf_0',
 }
-VOICES = {k: torch.load(os.path.join(snapshot, '
+VOICES = {k: torch.load(os.path.join(snapshot, 'voicepacks', f'{k}.pt'), weights_only=True).to(device) for k in CHOICES.values()}
 
 np_log_99 = np.log(99)
 def s_curve(p):
@@ -155,7 +136,7 @@ SAMPLE_RATE = 24000
 @spaces.GPU(duration=10)
 @torch.no_grad()
 def forward(tokens, voice, speed):
-    ref_s = VOICES[voice]
+    ref_s = VOICES[voice][len(tokens)]
     tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
     input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
     text_mask = length_to_mask(input_lengths).to(device)
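
The change above replaces a single per-voice reference tensor with a per-length lookup: `VOICES[voice]` now holds one reference-style row per possible token count, selected by `len(tokens)`. A minimal sketch of that lookup; the `[511, 1, 256]` voicepack shape is an assumption, not something this diff states:

```python
import torch

# Stand-in for one loaded voicepack: the diff implies torch.load(...) returns
# a tensor indexable by token count. The [511, 1, 256] shape is an assumption.
voicepack = torch.zeros(511, 1, 256)

tokens = [50, 83, 54, 156]       # stand-in token ids for one segment
ref_s = voicepack[len(tokens)]   # one 1x256 reference-style row per length
decoder_half = ref_s[:, :128]    # passed to model.decoder(...) in forward()
predictor_half = ref_s[:, 128:]  # the `s` used in lf_forward()
```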
@@ -178,7 +159,7 @@ def forward(tokens, voice, speed):
     asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
     return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
 
-def generate(text, voice, ps=None, speed=1.0,
+def generate(text, voice, ps=None, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000):
     if voice not in VOICES:
         # Ensure stability for https://huggingface.co/spaces/Pendrokar/TTS-Spaces-Arena
         voice = 'af'
@@ -194,8 +175,6 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
     except gr.exceptions.Error as e:
         raise gr.Error(e)
         return (None, '')
-    if reduce_noise > 0:
-        out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
     opening_cut = int(opening_cut / speed)
     if opening_cut > 0:
         out = out[opening_cut:]
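
With noise reduction removed, the remaining post-processing is plain sample-domain trimming. The cuts are given in samples at the 24 kHz output rate and divided by `speed`, so the trimmed span stays roughly constant in wall-clock terms. A small self-contained sketch; the closing-cut slice is an assumption, since its body is outside this diff:

```python
import numpy as np

SAMPLE_RATE = 24000
speed = 1.5
opening_cut = int(4000 / speed)  # slider default, scaled as in generate()
closing_cut = int(2000 / speed)  # slider default; symmetric scaling assumed

out = np.zeros(SAMPLE_RATE)      # stand-in for one generated waveform
if opening_cut > 0:
    out = out[opening_cut:]      # drop samples from the start
if closing_cut > 0:
    out = out[:-closing_cut]     # assumed: drop samples from the end
```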
@@ -216,6 +195,9 @@ def generate(text, voice, ps=None, speed=1.0, reduce_noise=0.5, opening_cut=4000
         out = np.concatenate([out, np.zeros(pad_after)])
     return ((SAMPLE_RATE, out), ps)
 
+def toggle_autoplay(autoplay):
+    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
+
 with gr.Blocks() as basic_tts:
     with gr.Row():
         gr.Markdown('Generate speech for one segment of text (up to 510 tokens) using Kokoro, a TTS model with 80 million parameters.')
@@ -234,12 +216,12 @@ with gr.Blocks() as basic_tts:
             phonemize_btn = gr.Button('Tokenize Input Text', variant='primary')
             phonemize_btn.click(phonemize, inputs=[text, voice], outputs=[in_ps])
         with gr.Column():
-            audio = gr.Audio(interactive=False, label='Output Audio')
+            audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
             with gr.Accordion('Output Tokens', open=True):
                 out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 allowed. Same as input tokens if supplied, excluding unknowns.')
             with gr.Accordion('Audio Settings', open=False):
                 with gr.Row():
-
+                    autoplay = gr.Checkbox(value=True, label='Autoplay')
                 with gr.Row():
                     speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
                 with gr.Row():
@@ -257,15 +239,18 @@ with gr.Blocks() as basic_tts:
                         pad_before = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad Before', info='🔇 How many samples of silence to insert before the start.')
                     with gr.Column():
                         pad_after = gr.Slider(minimum=0, maximum=24000, value=5000, step=1000, label='Pad After', info='🔇 How many samples of silence to append after the end.')
-
+    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+    text.submit(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
+    generate_btn.click(generate, inputs=[text, voice, in_ps, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after], outputs=[audio, out_ps])
 
 @spaces.GPU
 @torch.no_grad()
 def lf_forward(token_lists, voice, speed):
-
-    s = ref_s[:, 128:]
+    voicepack = VOICES[voice]
     outs = []
     for tokens in token_lists:
+        ref_s = voicepack[len(tokens)]
+        s = ref_s[:, 128:]
         tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
         text_mask = length_to_mask(input_lengths).to(device)
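
The autoplay wiring above uses the Gradio pattern where an event handler returns a new component instance and Gradio applies it as an update to the bound output. Isolated from the rest of the app, the pattern looks like this:

```python
import gradio as gr

def toggle_autoplay(autoplay):
    # Returning a component updates the existing `audio` output in place.
    return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)

with gr.Blocks() as demo:
    audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
    autoplay = gr.Checkbox(value=True, label='Autoplay')
    autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])

# demo.launch()
```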
@@ -340,7 +325,7 @@ def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2
     segments = [row for t in texts for row in recursive_split(t, voice)]
     return [(i, *row) for i, row in enumerate(segments)]
 
-def lf_generate(segments, voice, speed=1.0,
+def lf_generate(segments, voice, speed=1.0, opening_cut=4000, closing_cut=2000, ease_in=3000, ease_out=1000, pad_before=5000, pad_after=5000, pad_between=10000):
     token_lists = list(map(tokenize, segments['Tokens']))
     wavs = []
     opening_cut = int(opening_cut / speed)
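
`lf_generate` keeps a `pad_between` parameter (default 10000 samples) for silence between long-form segments. The join itself is outside this hunk, so the following is only a plausible sketch of how inter-segment padding would be applied; the speed scaling is assumed by analogy with the cuts:

```python
import numpy as np

speed = 1.0
pad_between = int(10000 / speed)  # assumed to scale with speed like the cuts

wavs = [np.zeros(24000), np.zeros(24000)]  # stand-in trimmed segments
pieces = []
for i, w in enumerate(wavs):
    if i > 0:
        pieces.append(np.zeros(pad_between))  # silence between segments
    pieces.append(w)
out = np.concatenate(pieces)
```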
@@ -357,8 +342,6 @@ def lf_generate(segments, voice, speed=1.0, reduce_noise=0.5, opening_cut=4000,
             raise gr.Error(e)
             break
         for out in outs:
-            if reduce_noise > 0:
-                out = nr.reduce_noise(y=out, sr=SAMPLE_RATE, prop_decrease=reduce_noise, n_fft=512)
             if opening_cut > 0:
                 out = out[opening_cut:]
             if closing_cut > 0:
@@ -415,8 +398,6 @@ with gr.Blocks() as lf_tts:
         with gr.Column():
             audio = gr.Audio(interactive=False, label='Output Audio')
             with gr.Accordion('Audio Settings', open=False):
-                with gr.Row():
-                    reduce_noise = gr.Slider(minimum=0, maximum=1, value=0.5, label='Reduce Noise', info='👻 Fix it in post: non-stationary noise reduction via spectral gating.')
                 with gr.Row():
                     speed = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label='Speed', info='⚡️ Adjust the speed of the audio. The settings below are auto-scaled by speed.')
                 with gr.Row():
@@ -440,7 +421,7 @@ with gr.Blocks() as lf_tts:
     segments = gr.Dataframe(headers=['#', 'Text', 'Tokens', 'Length'], row_count=(1, 'dynamic'), col_count=(4, 'fixed'), label='Segments', interactive=False, wrap=True)
     segments.change(fn=did_change_segments, inputs=[segments], outputs=[segment_btn, generate_btn])
     segment_btn.click(segment_and_tokenize, inputs=[text, voice, skip_square_brackets, newline_split], outputs=[segments])
-    generate_btn.click(lf_generate, inputs=[segments, voice, speed,
+    generate_btn.click(lf_generate, inputs=[segments, voice, speed, opening_cut, closing_cut, ease_in, ease_out, pad_before, pad_after, pad_between], outputs=[audio])
 
 with gr.Blocks() as about:
     gr.Markdown("""
@@ -453,11 +434,6 @@ The model was trained on 1x A100-class 80GB instances rented from [Vast.ai](http
 Vast was chosen over other compute providers due to its competitive on-demand hourly rates.<br/>
 The average hourly cost for the 1x A100-class 80GB VRAM instances used for training was below $1/hr — around half the quoted rates from other providers.
 
-### Updates
-This Space and the underlying Kokoro model are both under development and subject to change.<br/>
-Last model update: 2024 Nov 15<br/>
-Model trained by: Raven (@rzvzn on Discord)
-
 ### Licenses
 Inference code: MIT<br/>
 espeak-ng dependency: GPL-3.0<sup>[4]</sup><br/>
@@ -471,6 +447,9 @@ Random Japanese texts: CC0 public domain<sup>[6]</sup>
 4. eSpeak NG | https://github.com/espeak-ng/espeak-ng
 5. Quotable Data | https://github.com/quotable-io/data/blob/master/data/quotes.json
 6. Common Voice Japanese sentences | https://github.com/common-voice/common-voice/tree/main/server/data/ja
+
+### Contact
+@rzvzn on Discord
 """)
 
 with gr.Blocks() as api_info:
@@ -499,10 +478,19 @@ print(out_ps)
 Note that this Space and the underlying Kokoro model are both under development and subject to change. Reliability is not guaranteed. Hugging Face and/or Gradio might enforce their own rate limits.
 """)
 
+with gr.Blocks() as version_info:
+    gr.Markdown("""
+| Model Version | Date | Validation losses (mel/dur/f0) |
+| ------- | ---- | ------------------------------ |
+| v0.19 | 2024 Nov 22 | 0.261 / 0.627 / 1.897 |
+| v0.16 | 2024 Nov 15 | 0.263 / 0.646 / 1.934 |
+| v0.14 | 2024 Nov 12 | 0.262 / 0.642 / 1.889 |
+""")
+
 with gr.Blocks() as app:
     gr.TabbedInterface(
-        [basic_tts, lf_tts, about, api_info],
-        ['🗣️ Basic TTS', '📖 Long-Form', 'ℹ️ About', '🚀 Gradio API'],
+        [basic_tts, lf_tts, about, api_info, version_info],
+        ['🗣️ Basic TTS', '📖 Long-Form', 'ℹ️ About', '🚀 Gradio API', '📝 Version History'],
     )
 
 if __name__ == '__main__':
requirements.txt
CHANGED
@@ -2,7 +2,6 @@ fugashi
 gradio
 mojimoji
 munch
-noisereduce
 phonemizer
 pypdf
 scipy