mrq committed
Commit 33f0c95 · 1 Parent(s): 533ff16
Files changed (2):
  1. app.py +30 -2
  2. requirements.txt +1 -1
app.py CHANGED
@@ -42,6 +42,7 @@ if USING_SPACES:
     from vall_e.emb.qnt import decode_to_wave
     from vall_e.data import get_lang_symmap, get_random_prompt
     from vall_e.models.arch import AVAILABLE_ATTENTIONS
+    from vall_e.emb.transcribe import transcribe
 else:
     from .inference import TTS, cfg
     from .train import train
@@ -50,6 +51,8 @@ else:
     from .emb.qnt import decode_to_wave
     from .data import get_lang_symmap, get_random_prompt
     from .models.arch import AVAILABLE_ATTENTIONS
+    from .emb.transcribe import transcribe
+
 
 is_windows = sys.platform.startswith("win")
 
@@ -144,6 +147,11 @@ def load_sample( speaker ):
 
     return data, (sr, wav)
 
+def gradio_transcribe_input( audio, text, split_by ):
+    if not audio:
+        return ( text, split_by )
+    return ( transcribe( audio, model_name="openai/whisper-base", align=False )["text"], "lines" )
+
 def init_tts(config=None, lora=None, restart=False, device="cuda", dtype="auto", attention=None):
     global tts
 
@@ -203,6 +211,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
     parser.add_argument("--task", type=str, default="tts")
     parser.add_argument("--modality", type=str, default=kwargs["modality"])
     parser.add_argument("--references", type=str, default=kwargs["reference"])
+    parser.add_argument("--voice-convert", type=str, default=kwargs["voice-convert"])
     parser.add_argument("--language", type=str, default=kwargs["language"])
     parser.add_argument("--text-language", type=str, default=kwargs["text-language"])
     parser.add_argument("--split-text-by", type=str, default=kwargs["split-text-by"])
@@ -275,6 +284,7 @@ def do_inference_tts( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
     sampling_kwargs = dict(
         split_text_by=args.split_text_by,
         context_history=args.context_history,
+        voice_convert=args.voice_convert,
         max_steps=args.max_steps,
         max_levels=args.max_levels,
         max_duration=args.max_duration,
@@ -391,6 +401,7 @@ def do_inference_stt( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
     """
 @gradio_wrapper(inputs=layout["training"]["inputs"].keys())
 def do_training( progress=gr.Progress(track_tqdm=True), *args, **kwargs ):
+
     while True:
         metrics = next(it)
         yield metrics
@@ -430,10 +441,13 @@ with ui:
     with gr.Tab("Text-to-Speech"):
         with gr.Row():
             with gr.Column(scale=8):
-                layout["inference_tts"]["inputs"]["text"] = gr.Textbox(lines=5, value=get_random_prompt, label="Input Prompt")
+                with gr.Tab("Text"):
+                    layout["inference_tts"]["inputs"]["text"] = gr.Textbox(lines=5, value=get_random_prompt, label="Input Prompt")
+                with gr.Tab("Speech"):
+                    layout["inference_tts"]["inputs"]["voice-convert"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath") # , info="Guiding utterance.")
         with gr.Row():
             with gr.Column(scale=1):
-                layout["inference_tts"]["inputs"]["reference"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath") #, info="Reference audio for TTS")
+                layout["inference_tts"]["inputs"]["reference"] = gr.Audio(label="Audio Input", sources=["upload"], type="filepath") # , info="Reference audio for TTS")
                 # layout["inference_tts"]["stop"] = gr.Button(value="Stop")
                 layout["inference_tts"]["outputs"]["output"] = gr.Audio(label="Output")
                 layout["inference_tts"]["buttons"]["inference"] = gr.Button(value="Inference")
@@ -496,6 +510,20 @@ with ui:
             outputs=[ x for x in layout["inference_tts"]["outputs"].values() if x is not None]
         )
 
+        # IC
+        layout["inference_tts"]["inputs"]["voice-convert"].change(
+            gradio_transcribe_input,
+            [
+                layout["inference_tts"]["inputs"]["voice-convert"],
+                layout["inference_tts"]["inputs"]["text"],
+                layout["inference_tts"]["inputs"]["split-text-by"],
+            ],
+            [
+                layout["inference_tts"]["inputs"]["text"],
+                layout["inference_tts"]["inputs"]["split-text-by"],
+            ]
+        )
+
     with gr.Tab("Speech to Text"):
         with gr.Row():
             with gr.Column(scale=8):
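Taken together, the app.py changes add a "Speech" tab beside the text prompt: uploading a guiding utterance there feeds voice conversion (via the new --voice-convert argument and voice_convert sampling kwarg) and also auto-fills the text prompt through gradio_transcribe_input, bound to the audio component's change event. A minimal self-contained sketch of that wiring pattern follows; the transcriber stub, labels, and dropdown choices other than "lines" are illustrative, not the Space's actual values.

import gradio as gr

def fake_transcribe(audio_path):
    # Stand-in for vall_e.emb.transcribe.transcribe; returns dummy text.
    return f"transcript of {audio_path}"

def on_voice_convert(audio, text, split_by):
    # Mirrors gradio_transcribe_input: leave both fields alone if nothing was uploaded.
    if not audio:
        return (text, split_by)
    # Otherwise overwrite the prompt and force line-based splitting.
    return (fake_transcribe(audio), "lines")

with gr.Blocks() as demo:
    voice = gr.Audio(label="Guiding utterance", sources=["upload"], type="filepath")
    text = gr.Textbox(lines=5, label="Input Prompt")
    split = gr.Dropdown(choices=["sentences", "lines"], value="sentences", label="Split By")
    # Same shape as the app.py binding: current field values in, new values out.
    voice.change(on_voice_convert, [voice, text, split], [text, split])

demo.launch()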
 
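The transcription itself is delegated to the library's transcribe helper imported at the top of the diff. A sketch of calling it standalone, assuming the pinned vall_e package from requirements.txt is installed; the model_name/align arguments and the ["text"] key are exactly as in the diff, while the wrapper function and sample filename are hypothetical.

from vall_e.emb.transcribe import transcribe

def transcript_for(audio_path):
    # align=False skips word-level alignment; only the raw text is needed here.
    result = transcribe(audio_path, model_name="openai/whisper-base", align=False)
    return result["text"]

print(transcript_for("speaker_sample.wav"))  # hypothetical input file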
requirements.txt CHANGED
@@ -4,4 +4,4 @@ torchaudio
 sageattention==1.0.6
 pykakasi
 
-vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@4a65ac9eb7e1879cdab4bbe41c8bc07a0f4388da
+vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@c2e17e287bcceaa655752c635c92efa823c0eeac
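Pinning vall_e to an exact commit keeps the Space reproducible, and bumping it is a one-line hash swap. For local testing, the same pin installs with pip's direct-reference syntax:

pip install "vall_e @ git+https://github.com/e-c-k-e-r/vall-e.git@c2e17e287bcceaa655752c635c92efa823c0eeac"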