NTT123 committed
Commit 3dbfd73 · 1 Parent(s): 2157b01

Update tacotron model that uses phonemes instead of raw text.
Files changed:
- .gitattributes +1 -6
- .gitignore +1 -0
- alphabet.txt +26 -10
- app.py +5 -1
- inference.py +9 -0
- install_espeak_ng.sh +10 -0
- packages.txt +6 -0
- requirements.txt +2 -1
- tacotron.py +6 -2
- tacotron.toml +1 -0
- pretrained_model_ljs_600k.ckpt → tacotrons_ljs_24k_v1_0250000.ckpt +2 -2

.gitattributes CHANGED

@@ -27,9 +27,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 bazelisk-linux-amd64 filter=lfs diff=lfs merge=lfs -text
 wavegru_mod.so filter=lfs diff=lfs merge=lfs -text
-
-wavegru_vocoder_1024_v3_1310000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1330000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1340000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1360000.ckpt filter=lfs diff=lfs merge=lfs -text
-wavegru_vocoder_1024_v3_1400000.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED

@@ -0,0 +1 @@
+.venv

alphabet.txt CHANGED

@@ -1,25 +1,18 @@
 _
+■
  
 !
 "
-'
-(
-)
 ,
--
 .
 :
 ;
 ?
-[
-]
 a
 b
-c
 d
 e
 f
-g
 h
 i
 j
@@ -29,7 +22,6 @@ m
 n
 o
 p
-q
 r
 s
 t
@@ -37,5 +29,29 @@ u
 v
 w
 x
-y
 z
+æ
+ð
+ŋ
+ɐ
+ɑ
+ɔ
+ə
+ɚ
+ɛ
+ɜ
+ɡ
+ɪ
+ɹ
+ɾ
+ʃ
+ʊ
+ʌ
+ʒ
+ʔ
+ˈ
+ˌ
+ː
+̩
+θ
+ᵻ

app.py CHANGED

@@ -3,6 +3,10 @@
 # os.system("./bazelisk-linux-amd64 clean --expunge")
 # os.system("./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native")
 
+# install espeak
+import os
+
+os.system("bash ./install_espeak_ng.sh")
 
 import gradio as gr
 from inference import load_tacotron_model, load_wavegru_net, mel_to_wav, text_to_mel
@@ -11,7 +15,7 @@ from wavegru_cpp import extract_weight_mask, load_wavegru_cpp
 
 def speak(text):
     alphabet, tacotron_net, tacotron_config = load_tacotron_model(
-        "./alphabet.txt", "./tacotron.toml", "./
+        "./alphabet.txt", "./tacotron.toml", "./tacotrons_ljs_24k_v1_0250000.ckpt"
     )
 
     wavegru_config, wavegru_net = load_wavegru_net(

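Note on ordering (not part of the diff itself): install_espeak_ng.sh has to finish before inference is imported, because importing inference points PHONEMIZER_ESPEAK_LIBRARY at the freshly built ./espeak/usr/lib/libespeak-ng.so.1.1.51 and constructs an EspeakBackend at import time. A minimal sketch of that startup sequence; the Gradio wiring at the end is an assumption, since the interface code is not shown in this diff:

    import os

    # build espeak-ng into ./espeak/usr before anything imports phonemizer
    os.system("bash ./install_espeak_ng.sh")

    import gradio as gr
    from inference import load_tacotron_model, load_wavegru_net, mel_to_wav, text_to_mel

    def speak(text):
        ...  # load checkpoints, run text_to_mel then mel_to_wav (bodies not in this diff)

    # hypothetical wiring; the actual app.py interface code is not shown here
    gr.Interface(fn=speak, inputs="text", outputs="audio").launch()
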
inference.py CHANGED

@@ -1,3 +1,5 @@
+import os
+
 import jax
 import jax.numpy as jnp
 import librosa
@@ -14,6 +16,11 @@ from utils import (
 )
 from wavegru import WaveGRU
 
+os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
+from phonemizer.backend import EspeakBackend
+
+backend = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)
+
 
 def load_tacotron_model(alphabet_file, config_file, model_file):
     """load tacotron model to memory"""
@@ -34,6 +41,8 @@ tacotron_inference_fn = pax.pure(lambda net, text: net.inference(text, max_len=2
 def text_to_mel(net, text, alphabet, config):
     """convert text to mel spectrogram"""
     text = english_cleaners(text)
+    text = backend.phonemize([text], strip=True)[0]
+    text = text + config["END_CHARACTER"]
     text = text + config["PAD"] * (100 - (len(text) % 100))
     tokens = []
     for c in text:

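The new phonemize step is what drives the alphabet.txt change. A small illustrative example mirroring the calls added above; the exact output depends on the espeak-ng voice data, so treat the string as approximate:

    import os

    # same library path as inference.py; omit if espeak-ng is installed system-wide
    os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
    from phonemizer.backend import EspeakBackend

    backend = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)
    print(backend.phonemize(["Hello world!"], strip=True)[0])
    # roughly "həlˈoʊ wˈɜːld!" — an IPA string, which is why alphabet.txt now
    # lists symbols like æ ð ŋ ˈ ˌ ː and drops letters that no longer appear
    # in the phonemized output (c, g, q, y)
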
install_espeak_ng.sh ADDED

@@ -0,0 +1,10 @@
+rm -rf espeak
+mkdir -p espeak
+cd espeak
+wget https://github.com/espeak-ng/espeak-ng/archive/refs/tags/1.51.zip
+unzip -qq 1.51.zip
+cd espeak-ng-1.51
+./autogen.sh
+./configure --prefix=`pwd`/../usr
+make
+make install

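The Debian packages added to packages.txt below only supply the build toolchain; this script compiles espeak-ng 1.51 from source into ./espeak/usr, and inference.py then points phonemizer at the resulting shared library. A hypothetical sanity check (not in the repo) for that path:

    import os

    lib = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
    assert os.path.exists(lib), "run install_espeak_ng.sh first"
    # set before the EspeakBackend is created, as inference.py does
    os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = lib
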
packages.txt CHANGED

@@ -1 +1,7 @@
 libsndfile1-dev
+make
+autoconf
+automake
+libtool
+pkg-config
+gcc

requirements.txt CHANGED

@@ -8,4 +8,5 @@ numpy==1.22.3
 pax3==0.5.6
 pyyaml==6.0
 toml==0.10.2
-unidecode==1.3.4
+unidecode==1.3.4
+phonemizer==3.1.1

tacotron.py CHANGED

@@ -371,7 +371,10 @@ class Tacotron(pax.Module):
         x = x[:, : self.rr, :]
         x = jnp.reshape(x, (N, self.rr, -1))
         mel = x[..., :-1]
-
+        eos_logit = x[..., -1]
+        eos_pr = jax.nn.sigmoid(eos_logit[0, -1])
+        rng_key, eos_rng_key = jax.random.split(rng_key)
+        eos = jax.random.bernoulli(eos_rng_key, p=eos_pr)
         return attn_state, decoder_rnn_states, rng_key, (mel, eos)
 
     def inference(self, text, seed=42, max_len=1000):
@@ -381,6 +384,7 @@
         text = self.encode_text(text)
         text_key = self.text_key_fc(text)
         N, L, D = text.shape
+        assert N == 1
         mel = self.go_frame(N)
 
         attn_state, decoder_rnn_states = self.decoder_initial_state(N, L)
@@ -393,7 +397,7 @@
                 attn_state, decoder_rnn_states, rng_key, mel, text, text_key
             )
             mels.append(mel)
-            if eos
+            if eos.item() or count > max_len:
                 break
 
             mel = mel[:, -1, :]

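With this change the decoder treats the last channel of each predicted frame as an end-of-sequence logit and samples the stop decision from a Bernoulli distribution. A self-contained sketch of that step, assuming an output tensor of shape (N, rr, MEL_DIM + 1) as in the hunk above:

    import jax
    import jax.numpy as jnp

    def split_mel_and_eos(x, rng_key):
        mel = x[..., :-1]                          # mel frames
        eos_logit = x[..., -1]                     # last channel = stop logit
        eos_pr = jax.nn.sigmoid(eos_logit[0, -1])  # stop probability of the newest frame
        rng_key, eos_rng_key = jax.random.split(rng_key)
        eos = jax.random.bernoulli(eos_rng_key, p=eos_pr)  # sample the stop decision
        return mel, eos, rng_key

    x = jnp.zeros((1, 2, 81))  # toy tensor: N=1, rr=2, 80 mel bins + 1 eos logit
    mel, eos, _ = split_mel_and_eos(x, jax.random.PRNGKey(42))
    print(mel.shape, bool(eos))  # (1, 2, 80) and True/False with ~0.5 probability

Reading the stop probability from index [0, -1] and calling eos.item() in the decode loop only make sense for a single utterance, which is what the new assert N == 1 enforces.
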
tacotron.toml CHANGED

@@ -16,6 +16,7 @@ MEL_DIM = 80 # the dimension of melspectrogram features
 MEL_MIN = 1e-5
 PAD = "_" # padding character
 PAD_TOKEN = 0
+END_CHARACTER = "■" # to signal the end of the transcript
 TEST_DATA_SIZE = 1024
 
 # model

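END_CHARACTER works together with the steps added in inference.py: the phonemized string gets "■" appended, is padded with "_" to a multiple of 100, and is then mapped to token ids against alphabet.txt. A sketch under that reading; the char-to-id lookup is hypothetical, since the tokenization loop body is not part of this diff:

    config = {"PAD": "_", "END_CHARACTER": "■"}
    alphabet = open("alphabet.txt", encoding="utf-8").read().splitlines()

    text = "həlˈoʊ wˈɜːld!"                                  # phonemized input
    text = text + config["END_CHARACTER"]                     # mark end of transcript
    text = text + config["PAD"] * (100 - (len(text) % 100))   # pad to a multiple of 100
    tokens = [alphabet.index(c) for c in text]                 # hypothetical lookup
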
pretrained_model_ljs_600k.ckpt → tacotrons_ljs_24k_v1_0250000.ckpt RENAMED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:512b3af6ef95ccc53d3516256abae81b025e110fa886ec68f9f7033039013fc6
+size 53561547