mrq committed
Commit 33f0c95 · 1 Parent(s): 533ff16
- app.py +30 -2
- requirements.txt +1 -1
app.py
CHANGED
@@ -42,6 +42,7 @@ if USING_SPACES:
     from vall_e.emb.qnt import decode_to_wave
     from vall_e.data import get_lang_symmap, get_random_prompt
     from vall_e.models.arch import AVAILABLE_ATTENTIONS
+    from vall_e.emb.transcribe import transcribe
 else:
     from .inference import TTS, cfg
     from .train import train
@@ -50,6 +51,8 @@ else:
     from .emb.qnt import decode_to_wave
     from .data import get_lang_symmap, get_random_prompt
     from .models.arch import AVAILABLE_ATTENTIONS
+    from .emb.transcribe import transcribe
+
 
 is_windows = sys.platform.startswith("win")
 
@@ -144,6 +147,11 @@ def load_sample( speaker ):
 
     return data, (sr, wav)
 
+def gradio_transcribe_input( audio, text, split_by ):
+    if not audio:
+        return ( text, split_by )
+    return ( transcribe( audio, model_name="openai/whisper-base", align=False )["text"], "lines" )
+
 def init_tts(config=None, lora=None, restart=False, device="cuda", dtype="auto", attention=None):
     global tts
 
@@ -203,6 +211,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
     parser.add_argument("--task", type=str, default="tts")
     parser.add_argument("--modality", type=str, default=kwargs["modality"])
     parser.add_argument("--references", type=str, default=kwargs["reference"])
+    parser.add_argument("--voice-convert", type=str, default=kwargs["voice-convert"])
     parser.add_argument("--language", type=str, default=kwargs["language"])
     parser.add_argument("--text-language", type=str, default=kwargs["text-language"])
     parser.add_argument("--split-text-by", type=str, default=kwargs["split-text-by"])
@@ -275,6 +284,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
     sampling_kwargs = dict(
         split_text_by=args.split_text_by,
         context_history=args.context_history,
+        voice_convert=args.voice_convert,
         max_steps=args.max_steps,
         max_levels=args.max_levels,
         max_duration=args.max_duration,
@@ -391,6 +401,7 @@ def do_inference_stt( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
 """
 @gradio_wrapper(inputs=layout["training"]["inputs"].keys())
 def do_training( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
+
     while True:
         metrics = next(it)
         yield metrics
@@ -430,10 +441,13 @@ with ui:
     with gr.Tab("Text-to-Speech"):
         with gr.Row():
             with gr.Column(scale=8):
-                layout["inference_tts"]["inputs"]["text"] = gr.Textbox(lines=5, value=get_random_prompt, label="Input Prompt")
+                with gr.Tab("Text"):
+                    layout["inference_tts"]["inputs"]["text"] = gr.Textbox(lines=5, value=get_random_prompt, label="Input Prompt")
+                with gr.Tab("Speech"):
+                    layout["inference_tts"]["inputs"]["voice-convert"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath") # , info="Guiding utternace.")
         with gr.Row():
             with gr.Column(scale=1):
-                layout["inference_tts"]["inputs"]["reference"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath")
+                layout["inference_tts"]["inputs"]["reference"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath") # , info="Reference audio for TTS")
                 # layout["inference_tts"]["stop"] = gr.Button(value="Stop")
                 layout["inference_tts"]["outputs"]["output"] = gr.Audio(label="Output")
                 layout["inference_tts"]["buttons"]["inference"] = gr.Button(value="Inference")
@@ -496,6 +510,20 @@ with ui:
             outputs=[ x for x in layout["inference_tts"]["outputs"].values() if x is not None]
         )
 
+        # IC
+        layout["inference_tts"]["inputs"]["voice-convert"].change(
+            gradio_transcribe_input,
+            [
+                layout["inference_tts"]["inputs"]["voice-convert"],
+                layout["inference_tts"]["inputs"]["text"],
+                layout["inference_tts"]["inputs"]["split-text-by"],
+            ],
+            [
+                layout["inference_tts"]["inputs"]["text"],
+                layout["inference_tts"]["inputs"]["split-text-by"],
+            ]
+        )
+
     with gr.Tab("Speech to Text"):
         with gr.Row():
             with gr.Column(scale=8):
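The functional core of this commit is the new voice-conversion flow: uploading a guiding utterance under the added "Speech" tab fires the voice-convert Audio component's .change() event, which calls gradio_transcribe_input to fill the text prompt with a Whisper transcription and switch text splitting to "lines". Below is a minimal standalone sketch of that wiring, not the commit's code: it swaps vall_e.emb.transcribe.transcribe for a transformers ASR pipeline, and the split-by choices shown are placeholders (only "lines" appears in the diff).

# Standalone sketch (not part of the commit): mirrors the .change() wiring added in app.py,
# with a transformers ASR pipeline standing in for vall_e.emb.transcribe.transcribe.
import gradio as gr
from transformers import pipeline

# "openai/whisper-base" matches the model name used by the commit's gradio_transcribe_input.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def gradio_transcribe_input( audio, text, split_by ):
    # No guiding utterance uploaded: leave the prompt and split mode untouched.
    if not audio:
        return ( text, split_by )
    # Fill the prompt with the transcription and split the text by lines,
    # as the function added in app.py does.
    return ( asr( audio )["text"], "lines" )

with gr.Blocks() as demo:
    text = gr.Textbox(lines=5, label="Input Prompt")
    voice_convert = gr.Audio(label="Audio Input", sources=["upload"], type="filepath")
    # Placeholder choices for the sketch; the real app defines its own split-text-by options.
    split_by = gr.Dropdown(choices=["sentences", "lines"], value="sentences", label="Split Text By")

    # When a guiding utterance is uploaded, auto-fill the prompt with its transcription.
    voice_convert.change(
        gradio_transcribe_input,
        [ voice_convert, text, split_by ],
        [ text, split_by ],
    )

demo.launch()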
requirements.txt
CHANGED
@@ -4,4 +4,4 @@ torchaudio
 sageattention==1.0.6
 pykakasi
 
-vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@
+vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@c2e17e287bcceaa655752c635c92efa823c0eeac
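The requirements change pins vall_e to a specific upstream commit, presumably so the Space installs a build that ships the vall_e.emb.transcribe helper app.py now imports. A quick sanity check after installing the pinned dependency (a sketch, not part of the commit):

# Sanity check (not part of the commit): confirm the pinned vall_e build exposes
# the transcribe helper that app.py now imports.
from vall_e.emb.transcribe import transcribe
print(transcribe)  # should print the function object rather than raise ImportError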