zhzluke96 committed · commit d5b3cd8 · 1 parent: 84cfd61

update

Files changed:
- modules/SynthesizeSegments.py +8 -8
- modules/api/impl/ssml_api.py +9 -4
- modules/synthesize_audio.py +27 -19
- modules/utils/audio.py +21 -3
- webui.py +16 -32
modules/SynthesizeSegments.py CHANGED

@@ -9,6 +9,7 @@ from modules.normalization import text_normalize
 import logging
 import json
 import random
+import copy
 
 from modules.speaker import Speaker
 
@@ -61,6 +62,9 @@ class SynthesizeSegments:
         self.batch_size = batch_size
 
     def segment_to_generate_params(self, segment: Dict[str, Any]) -> Dict[str, Any]:
+        if segment.get("params", None) is not None:
+            return segment["params"]
+
         text = segment.get("text", "")
         is_end = segment.get("is_end", False)
 
@@ -111,19 +115,15 @@
         for segment in segments:
             params = self.segment_to_generate_params(segment)
 
-            key_params = params
+            key_params = copy.copy(params)
             if isinstance(key_params.get("spk"), Speaker):
                 key_params["spk"] = str(key_params["spk"].id)
             key = json.dumps(
                 {k: v for k, v in key_params.items() if k != "text"}, sort_keys=True
             )
-            if […]
-                key = […]
-                […]
-            else:
-                if key not in buckets:
-                    buckets[key] = []
-                buckets[key].append(segment)
+            if key not in buckets:
+                buckets[key] = []
+            buckets[key].append(segment)
 
         # Convert dictionary to list of buckets
         bucket_list = list(buckets.values())
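
Review note on the copy.copy hunk: building the bucket key rewrites key_params["spk"] from a Speaker object to its id string so the dict can be JSON-serialized; without the shallow copy, that replacement mutated the very params dict later handed to the generator. A standalone sketch of the fixed behaviour (the Speaker stand-in is illustrative, not the project's class):

import copy
import json

class Speaker:  # stand-in exposing only the attribute the key builder touches
    def __init__(self, id):
        self.id = id

def bucket_key(params: dict) -> str:
    key_params = copy.copy(params)  # shallow copy, as in this commit
    if isinstance(key_params.get("spk"), Speaker):
        key_params["spk"] = str(key_params["spk"].id)  # JSON-serializable form
    return json.dumps(
        {k: v for k, v in key_params.items() if k != "text"}, sort_keys=True
    )

params = {"text": "hello", "spk": Speaker(42), "temperature": 0.3}
bucket_key(params)
assert isinstance(params["spk"], Speaker)  # caller's dict is left intact

A shallow copy suffices here because only the top-level "spk" entry is reassigned; no nested value is mutated.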
modules/api/impl/ssml_api.py CHANGED

@@ -23,7 +23,7 @@ from modules.api.Api import APIManager
 class SSMLRequest(BaseModel):
     ssml: str
     format: str = "mp3"
-
+    batch_size: int = 4
 
 
 async def synthesize_ssml(
@@ -34,7 +34,12 @@ async def synthesize_ssml(
     try:
         ssml = request.ssml
         format = request.format
-
+        batch_size = request.batch_size
+
+        if batch_size < 1:
+            raise HTTPException(
+                status_code=400, detail="Batch size must be greater than 0."
+            )
 
         if not ssml:
             raise HTTPException(status_code=400, detail="SSML content is required.")
@@ -43,8 +48,8 @@
         for seg in segments:
             seg["text"] = text_normalize(seg["text"], is_end=True)
 
-        if […]
-            synthesize = SynthesizeSegments([…])
+        if batch_size != 1:
+            synthesize = SynthesizeSegments(batch_size)
         audio_segments = synthesize.synthesize_segments(segments)
         combined_audio = combine_audio_segments(audio_segments)
         buffer = io.BytesIO()
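
With the new batch_size field on SSMLRequest, clients can opt into batched synthesis per request. A hedged example call — the route path and port are assumptions, since the hunk shows only the handler (note also that, as extracted, only the SynthesizeSegments(...) line sits under if batch_size != 1:, so the original indentation of that branch may have been lost):

import requests

resp = requests.post(
    "http://localhost:7870/v1/ssml",  # assumed mount point for synthesize_ssml
    json={
        "ssml": "<speak>Hello there</speak>",
        "format": "mp3",
        "batch_size": 4,  # new field; values < 1 are rejected with HTTP 400
    },
)
resp.raise_for_status()
with open("out.mp3", "wb") as f:
    f.write(resp.content)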
modules/synthesize_audio.py CHANGED

@@ -1,12 +1,12 @@
+import io
 from modules.SentenceSplitter import SentenceSplitter
-from modules.normalization import text_normalize
+from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
 
 from modules import generate_audio as generate
 
 
-import numpy as np
-
 from modules.speaker import Speaker
+from modules.utils import audio
 
 
 def synthesize_audio(
@@ -39,20 +39,28 @@
     else:
         spliter = SentenceSplitter(spliter_threshold)
         sentences = spliter.parse(text)
-        sentences = [text_normalize(s) for s in sentences]
-        audio_data_batch = generate.generate_audio_batch(
-            texts=sentences,
-            temperature=temperature,
-            top_P=top_P,
-            top_K=top_K,
-            spk=spk,
-            infer_seed=infer_seed,
-            use_decoder=use_decoder,
-            prompt1=prompt1,
-            prompt2=prompt2,
-            prefix=prefix,
-        )
-        sample_rate = audio_data_batch[0][0]
-        audio_data = np.concatenate([data for _, data in audio_data_batch])
 
-    return sample_rate, audio_data
+        text_segments = [
+            {
+                "text": s,
+                "params": {
+                    "text": s,
+                    "temperature": temperature,
+                    "top_P": top_P,
+                    "top_K": top_K,
+                    "spk": spk,
+                    "infer_seed": infer_seed,
+                    "use_decoder": use_decoder,
+                    "prompt1": prompt1,
+                    "prompt2": prompt2,
+                    "prefix": prefix,
+                },
+            }
+            for s in sentences
+        ]
+        synthesizer = SynthesizeSegments(batch_size)
+        audio_segments = synthesizer.synthesize_segments(text_segments)
+
+        combined_audio = combine_audio_segments(audio_segments)
+
+        return audio.pydub_to_np(combined_audio)
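
The refactor builds a complete params dict per sentence, which is exactly what the new early return in segment_to_generate_params consumes; plain-text sentences and SSML segments now share one batching path, and the function returns audio.pydub_to_np(combined_audio), i.e. (frame_rate, float32 array). A hedged usage sketch (argument values are illustrative; defaults are not visible in this diff):

from modules.synthesize_audio import synthesize_audio

# Sentences whose params serialize to the same bucket key are generated
# together, batch_size at a time.
sample_rate, audio_data = synthesize_audio(
    text="A long text that SentenceSplitter will break into sentences ...",
    temperature=0.3,
    top_P=0.7,
    top_K=20,
    infer_seed=42,
    batch_size=4,
)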
modules/utils/audio.py CHANGED

@@ -9,9 +9,12 @@ INT16_MAX = np.iinfo(np.int16).max
 
 
 def audio_to_int16(audio_data):
-    if […]
-        audio_data[…]
-    […]
+    if (
+        audio_data.dtype == np.float32
+        or audio_data.dtype == np.float64
+        or audio_data.dtype == np.float128
+        or audio_data.dtype == np.float16
+    ):
         audio_data = (audio_data * INT16_MAX).astype(np.int16)
     return audio_data
 
@@ -27,6 +30,21 @@
     return fp_arr
 
 
+def pydub_to_np(audio: AudioSegment) -> tuple[int, np.ndarray]:
+    """
+    Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
+    where each value is in range [-1.0, 1.0].
+    Returns tuple (audio_np_array, sample_rate).
+    """
+    return (
+        audio.frame_rate,
+        np.array(audio.get_array_of_samples(), dtype=np.float32).reshape(
+            (-1, audio.channels)
+        )
+        / (1 << (8 * audio.sample_width - 1)),
+    )
+
+
 def ndarray_to_segment(ndarray, frame_rate):
     buffer = BytesIO()
     sf.write(buffer, ndarray, frame_rate, format="wav")
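
Two review notes here: pydub_to_np actually returns (frame_rate, samples) even though its docstring says (audio_np_array, sample_rate), and np.float128 does not exist in NumPy on every platform (Windows builds lack it), so merely evaluating that dtype chain can raise AttributeError there. A round-trip sketch, with a portable dtype check shown for comparison:

import numpy as np
from modules.utils import audio

sr = 24000
t = np.linspace(0, 1, sr, endpoint=False)
wave = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # 1 s of A440

seg = audio.ndarray_to_segment(wave, sr)   # float ndarray -> pydub AudioSegment
rate, data = audio.pydub_to_np(seg)        # rate first, then float32 samples
pcm = audio.audio_to_int16(data)           # floats scaled by INT16_MAX

# Portable equivalent of the float32/float64/float128/float16 chain:
assert np.issubdtype(data.dtype, np.floating)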
webui.py CHANGED

@@ -16,6 +16,8 @@ import logging
 
 from numpy import clip
 
+from modules.synthesize_audio import synthesize_audio
+
 logging.basicConfig(
     level=os.getenv("LOG_LEVEL", "INFO"),
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -145,8 +147,8 @@ def tts_generate(
     top_k = int(top_k)
 
     params = calc_spk_style(spk=spk, style=style)
-
     spk = params.get("spk", spk)
+
    infer_seed = infer_seed or params.get("seed", infer_seed)
     temperature = temperature or params.get("temperature", temperature)
     prefix = prefix or params.get("prefix", prefix)
@@ -159,37 +161,19 @@
     if not disable_normalize:
         text = text_normalize(text)
 
-    […13 deleted lines lost in the page extraction…]
-    else:
-        spliter = SentenceSplitter(webui_config["spliter_threshold"])
-        sentences = spliter.parse(text)
-        sentences = [text_normalize(s) for s in sentences]
-        audio_data_batch = generate_audio_batch(
-            texts=sentences,
-            temperature=temperature,
-            top_P=top_p,
-            top_K=top_k,
-            spk=spk,
-            infer_seed=infer_seed,
-            use_decoder=use_decoder,
-            prompt1=prompt1,
-            prompt2=prompt2,
-            prefix=prefix,
-        )
-        sample_rate = audio_data_batch[0][0]
-        audio_data = np.concatenate([data for _, data in audio_data_batch])
+    sample_rate, audio_data = synthesize_audio(
+        text=text,
+        temperature=temperature,
+        top_P=top_p,
+        top_K=top_k,
+        spk=spk,
+        infer_seed=infer_seed,
+        use_decoder=use_decoder,
+        prompt1=prompt1,
+        prompt2=prompt2,
+        prefix=prefix,
+        batch_size=batch_size,
+    )
 
     audio_data = audio.audio_to_int16(audio_data)
     return sample_rate, audio_data
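
On the return contract: tts_generate now delegates synthesis to synthesize_audio and still hands Gradio a (sample_rate, int16 ndarray) tuple, the value format gr.Audio accepts, hence the final audio_to_int16 conversion. A minimal illustrative wiring (not the Space's actual UI code):

import gradio as gr
from modules.synthesize_audio import synthesize_audio
from modules.utils import audio

def demo_tts(text: str):
    sample_rate, audio_data = synthesize_audio(text=text, batch_size=4)
    return sample_rate, audio.audio_to_int16(audio_data)

gr.Interface(fn=demo_tts, inputs="text", outputs=gr.Audio()).launch()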