mrq committed · Commit 33f0c95 · 1 Parent(s): 533ff16
- app.py +30 -2
- requirements.txt +1 -1
app.py CHANGED
@@ -42,6 +42,7 @@ if USING_SPACES:
     from vall_e.emb.qnt import decode_to_wave
     from vall_e.data import get_lang_symmap, get_random_prompt
     from vall_e.models.arch import AVAILABLE_ATTENTIONS
+    from vall_e.emb.transcribe import transcribe
 else:
     from .inference import TTS, cfg
     from .train import train
@@ -50,6 +51,8 @@ else:
     from .emb.qnt import decode_to_wave
     from .data import get_lang_symmap, get_random_prompt
     from .models.arch import AVAILABLE_ATTENTIONS
+    from .emb.transcribe import transcribe
+
 
 is_windows = sys.platform.startswith("win")
 
@@ -144,6 +147,11 @@ def load_sample( speaker ):
 
     return data, (sr, wav)
 
+def gradio_transcribe_input( audio, text, split_by ):
+    if not audio:
+        return ( text, split_by )
+    return ( transcribe( audio, model_name="openai/whisper-base", align=False )["text"], "lines" )
+
 def init_tts(config=None, lora=None, restart=False, device="cuda", dtype="auto", attention=None):
     global tts
 
@@ -203,6 +211,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
     parser.add_argument("--task", type=str, default="tts")
     parser.add_argument("--modality", type=str, default=kwargs["modality"])
     parser.add_argument("--references", type=str, default=kwargs["reference"])
+    parser.add_argument("--voice-convert", type=str, default=kwargs["voice-convert"])
     parser.add_argument("--language", type=str, default=kwargs["language"])
     parser.add_argument("--text-language", type=str, default=kwargs["text-language"])
     parser.add_argument("--split-text-by", type=str, default=kwargs["split-text-by"])
@@ -275,6 +284,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
     sampling_kwargs = dict(
         split_text_by=args.split_text_by,
         context_history=args.context_history,
+        voice_convert=args.voice_convert,
         max_steps=args.max_steps,
         max_levels=args.max_levels,
         max_duration=args.max_duration,
@@ -391,6 +401,7 @@ def do_inference_stt( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
 """
 @gradio_wrapper(inputs=layout["training"]["inputs"].keys())
 def do_training( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
+
     while True:
         metrics = next(it)
         yield metrics
@@ -430,10 +441,13 @@ with ui:
     with gr.Tab("Text-to-Speech"):
         with gr.Row():
             with gr.Column(scale=8):
-                layout["inference_tts"]["inputs"]["text"] = gr.Textbox(lines=5, value=get_random_prompt, label="Input Prompt")
+                with gr.Tab("Text"):
+                    layout["inference_tts"]["inputs"]["text"] = gr.Textbox(lines=5, value=get_random_prompt, label="Input Prompt")
+                with gr.Tab("Speech"):
+                    layout["inference_tts"]["inputs"]["voice-convert"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath") # , info="Guiding utterance.")
         with gr.Row():
             with gr.Column(scale=1):
-                layout["inference_tts"]["inputs"]["reference"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath")
+                layout["inference_tts"]["inputs"]["reference"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath") # , info="Reference audio for TTS")
             # layout["inference_tts"]["stop"] = gr.Button(value="Stop")
             layout["inference_tts"]["outputs"]["output"] = gr.Audio(label="Output")
             layout["inference_tts"]["buttons"]["inference"] = gr.Button(value="Inference")
@@ -496,6 +510,20 @@ with ui:
         outputs=[ x for x in layout["inference_tts"]["outputs"].values() if x is not None]
     )
 
+    # IC
+    layout["inference_tts"]["inputs"]["voice-convert"].change(
+        gradio_transcribe_input,
+        [
+            layout["inference_tts"]["inputs"]["voice-convert"],
+            layout["inference_tts"]["inputs"]["text"],
+            layout["inference_tts"]["inputs"]["split-text-by"],
+        ],
+        [
+            layout["inference_tts"]["inputs"]["text"],
+            layout["inference_tts"]["inputs"]["split-text-by"],
+        ]
+    )
+
     with gr.Tab("Speech to Text"):
         with gr.Row():
             with gr.Column(scale=8):
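Two threads run through the app.py change: a new voice-convert input is plumbed from the Gradio layout through argparse into sampling_kwargs, and uploading audio to that input auto-transcribes it into the text prompt via the new gradio_transcribe_input helper. The event wiring is the less obvious part, so below is a minimal, self-contained sketch of the same transcribe-on-upload pattern. It is illustrative rather than the Space's actual code: a transformers whisper-base pipeline stands in for vall_e.emb.transcribe.transcribe, and flat component variables replace the layout dict.

import gradio as gr
from transformers import pipeline

# Stand-in for vall_e's transcribe() wrapper: a plain whisper-base ASR pipeline.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def gradio_transcribe_input( audio, text, split_by ):
    # Nothing uploaded (or the upload was cleared): leave both fields untouched.
    if not audio:
        return ( text, split_by )
    # Prefill the prompt with the transcription and switch splitting to line
    # mode, mirroring what the commit's helper returns.
    return ( asr( audio )["text"], "lines" )

with gr.Blocks() as demo:
    voice_convert = gr.Audio(label="Audio Input", sources=["upload"], type="filepath")
    text = gr.Textbox(lines=5, label="Input Prompt")
    split_by = gr.Dropdown(["sentences", "lines"], value="sentences", label="Split Text By")
    # Whenever the upload changes, rewrite the prompt and the split mode.
    voice_convert.change(
        gradio_transcribe_input,
        [ voice_convert, text, split_by ],
        [ text, split_by ],
    )

demo.launch()

Returning the incoming ( text, split_by ) unchanged when the upload is empty matters: .change also fires when the file is cleared, and falling through to the transcription path would wipe whatever prompt the user had typed.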
requirements.txt CHANGED
@@ -4,4 +4,4 @@ torchaudio
 sageattention==1.0.6
 pykakasi
 
-vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@
+vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@c2e17e287bcceaa655752c635c92efa823c0eeac
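The requirements.txt side pins the vall_e dependency to an exact revision. For reference, this is pip's direct-reference syntax for VCS requirements; the placeholder form below is generic, and any resolvable git ref may follow the trailing @:

vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@<branch, tag, or full commit SHA>

Pinning the full commit SHA, as done here, makes Space rebuilds reproducible instead of tracking whatever the repository's default branch happens to point to.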