naveenk-ai committed
Commit 7c9c787 · verified · Parent: da71666

Update app.py

Files changed (1)
  1. app.py +83 -407
app.py CHANGED
@@ -1,8 +1,18 @@
1
  import sys
2
  from pathlib import Path
 
3
 
 
4
  repo_dir = Path("OpenVoice")
5
-
6
  if not repo_dir.exists():
7
  !git clone https://github.com/myshell-ai/OpenVoice
8
  orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
@@ -15,23 +25,14 @@ if not repo_dir.exists():
15
  data = data.replace("unidecode", "anyascii")
16
  with english_path.open("w") as out_f:
17
  out_f.write(data)
18
- # append to sys.path so that modules from the repo could be imported
19
  sys.path.append(str(repo_dir))
20
 
 
21
  %pip install -q "librosa>=0.8.1" "wavmark>=0.0.3" "faster-whisper>=0.9.0" "pydub>=0.25.1" "whisper-timestamped>=1.14.2" "tqdm" "inflect>=7.0.0" "eng_to_ipa>=0.0.2" "pypinyin>=0.50.0" \
22
  "cn2an>=0.5.22" "jieba>=0.42.1" "langid>=1.1.6" "gradio>=4.15" "ipywebrtc" "anyascii" "openvino>=2023.3" "torch>=2.1" "nncf>=2.11.0"
23
 
24
- import os
25
- import torch
26
- import openvino as ov
27
- import ipywidgets as widgets
28
- from IPython.display import Audio
29
-
30
  core = ov.Core()
31
 
32
- from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
33
- import openvoice.se_extractor as se_extractor
34
-
35
  CKPT_BASE_PATH = "checkpoints"
36
 
37
  en_suffix = f"{CKPT_BASE_PATH}/base_speakers/EN"
@@ -42,11 +43,9 @@ enable_chinese_lang = False
42
 
43
  def download_from_hf_hub(filename, local_dir="./"):
44
  from huggingface_hub import hf_hub_download
45
-
46
  os.makedirs(local_dir, exist_ok=True)
47
  hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)
48
 
49
-
50
  download_from_hf_hub(f"{converter_suffix}/checkpoint.pth")
51
  download_from_hf_hub(f"{converter_suffix}/config.json")
52
  download_from_hf_hub(f"{en_suffix}/checkpoint.pth")
@@ -68,20 +67,6 @@ en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")
68
  tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
69
  tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")
70
 
71
- if enable_chinese_lang:
72
- zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
73
- zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
74
- else:
75
- zh_base_speaker_tts = None
76
-
77
- pt_device = "cpu"
78
-
79
- en_base_speaker_tts = BaseSpeakerTTS(f"{en_suffix}/config.json", device=pt_device)
80
- en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")
81
-
82
- tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
83
- tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")
84
-
85
  if enable_chinese_lang:
86
  zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
87
  zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
@@ -89,22 +74,13 @@ else:
89
  zh_base_speaker_tts = None
90
 
91
  class OVOpenVoiceBase(torch.nn.Module):
92
- """
93
- Base class for both TTS and voice tone conversion model: constructor is same for both of them.
94
- """
95
-
96
  def __init__(self, voice_model: OpenVoiceBaseClass):
97
  super().__init__()
98
  self.voice_model = voice_model
99
  for par in voice_model.model.parameters():
100
  par.requires_grad = False
101
 
102
-
103
  class OVOpenVoiceTTS(OVOpenVoiceBase):
104
- """
105
- Constructor of this class accepts BaseSpeakerTTS object for speech generation and wraps it's 'infer' method with forward.
106
- """
107
-
108
  def get_example_input(self):
109
  stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False)
110
  x_tst = stn_tst.unsqueeze(0)
@@ -125,12 +101,7 @@ class OVOpenVoiceTTS(OVOpenVoiceBase):
125
  def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
126
  return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)
127
 
128
-
129
  class OVOpenVoiceConverter(OVOpenVoiceBase):
130
- """
131
- Constructor of this class accepts ToneColorConverter object for voice tone conversion and wraps it's 'voice_conversion' method with forward.
132
- """
133
-
134
  def get_example_input(self):
135
  y = torch.randn([1, 513, 238], dtype=torch.float32)
136
  y_lengths = torch.LongTensor([y.size(-1)])
@@ -142,9 +113,6 @@ class OVOpenVoiceConverter(OVOpenVoiceBase):
142
  def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
143
  return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)
144
 
145
- import nncf
146
-
147
-
148
  IRS_PATH = "openvino_irs/"
149
  EN_TTS_IR = f"{IRS_PATH}/openvoice_en_tts.xml"
150
  ZH_TTS_IR = f"{IRS_PATH}/openvoice_zh_tts.xml"
@@ -180,46 +148,11 @@ reference_speakers = [
180
  "load_manually",
181
  ]
182
 
183
- ref_speaker = widgets.Dropdown(
184
- options=reference_speakers,
185
- value=reference_speakers[0],
186
- description="reference voice from which tone color will be copied",
187
- disabled=False,
188
- )
189
-
190
- ref_speaker
191
-
192
- OUTPUT_DIR = "outputs/"
193
- os.makedirs(OUTPUT_DIR, exist_ok=True)
194
-
195
- ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
196
- allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"
197
-
198
- if ref_speaker.value == "record_manually":
199
- ref_speaker_path = f"{OUTPUT_DIR}/custom_example_sample.webm"
200
- from ipywebrtc import AudioRecorder, CameraStream
201
-
202
- camera = CameraStream(constraints={"audio": True, "video": False})
203
- recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
204
- display(recorder)
205
- elif ref_speaker.value == "load_manually":
206
- upload_ref = widgets.FileUpload(
207
- accept=allowed_audio_types,
208
- multiple=False,
209
- description="Select audio with reference voice",
210
- )
211
- display(upload_ref)
212
-
213
  def save_audio(voice_source: widgets.FileUpload, out_path: str):
214
  with open(out_path, "wb") as output_file:
215
  assert len(voice_source.value) > 0, "Please select audio file"
216
  output_file.write(voice_source.value[0]["content"])
217
 
218
-
219
- if ref_speaker.value == "load_manually":
220
- ref_speaker_path = f"{OUTPUT_DIR}/{upload_ref.value[0].name}"
221
- save_audio(upload_ref, ref_speaker_path)
222
-
223
  en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth")
224
  en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth")
225
  zh_source_se = torch.load(f"{zh_suffix}/zh_default_se.pth") if enable_chinese_lang else None
@@ -235,7 +168,6 @@ def get_pathched_infer(ov_model: ov.Model, device: str) -> callable:
235
 
236
  return infer_impl
237
 
238
-
239
  def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
240
  compiled_model = core.compile_model(ov_model, device)
241
 
@@ -245,71 +177,13 @@ def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
245
 
246
  return voice_conversion_impl
247
 
248
-
249
  en_base_speaker_tts.model.infer = get_pathched_infer(ov_en_tts, device.value)
250
  tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
251
  if enable_chinese_lang:
252
  zh_base_speaker_tts.model.infer = get_pathched_infer(ov_zh_tts, device.value)
253
 
254
- voice_source = widgets.Dropdown(
255
- options=["use TTS", "choose_manually"],
256
- value="use TTS",
257
- description="Voice source",
258
- disabled=False,
259
- )
260
-
261
- voice_source
262
-
263
- if voice_source.value == "choose_manually":
264
- upload_orig_voice = widgets.FileUpload(
265
- accept=allowed_audio_types,
266
- multiple=False,
267
- description="audo whose tone will be replaced",
268
- )
269
- display(upload_orig_voice)
270
-
271
- if voice_source.value == "choose_manually":
272
- orig_voice_path = f"{OUTPUT_DIR}/{upload_orig_voice.value[0].name}"
273
- save_audio(upload_orig_voice, orig_voice_path)
274
- source_se, _ = se_extractor.get_se(orig_voice_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
275
- else:
276
- text = """
277
- OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve
278
- a variety of tasks including emulation of human vision, automatic speech recognition, natural language processing,
279
- recommendation systems, and many others.
280
- """
281
- source_se = en_source_default_se
282
- orig_voice_path = f"{OUTPUT_DIR}/tmp.wav"
283
- en_base_speaker_tts.tts(text, orig_voice_path, speaker="default", language="English")
284
-
285
- tau_slider = widgets.FloatSlider(
286
- value=0.3,
287
- min=0.01,
288
- max=2.0,
289
- step=0.01,
290
- description="tau",
291
- disabled=False,
292
- readout_format=".2f",
293
- )
294
- tau_slider
295
-
296
- resulting_voice_path = f"{OUTPUT_DIR}/output_with_cloned_voice_tone.wav"
297
-
298
- tone_color_converter.convert(
299
- audio_src_path=orig_voice_path,
300
- src_se=source_se,
301
- tgt_se=target_se,
302
- output_path=resulting_voice_path,
303
- tau=tau_slider.value,
304
- message="@MyShell",
305
- )
306
-
307
- import gradio as gr
308
- import langid
309
-
310
  supported_languages = ["zh", "en"]
311
 
312
-
313
  def build_predict(
314
  output_dir,
315
  tone_color_converter,
@@ -318,273 +192,62 @@ def build_predict(
318
  en_source_default_se,
319
  en_source_style_se,
320
  zh_source_se,
 
321
  ):
322
- def predict(prompt, style, audio_file_pth, agree):
323
- return predict_impl(
324
- prompt,
325
- style,
326
- audio_file_pth,
327
- agree,
328
- output_dir,
329
- tone_color_converter,
330
- en_tts_model,
331
- zh_tts_model,
332
- en_source_default_se,
333
- en_source_style_se,
334
- zh_source_se,
335
- )
336
-
337
- return predict
338
-
339
-
340
- def predict_impl(
341
- prompt,
342
- style,
343
- audio_file_pth,
344
- agree,
345
- output_dir,
346
- tone_color_converter,
347
- en_tts_model,
348
- zh_tts_model,
349
- en_source_default_se,
350
- en_source_style_se,
351
- zh_source_se,
352
- ):
353
- text_hint = ""
354
- if not agree:
355
- text_hint += "[ERROR] Please accept the Terms & Condition!\n"
356
- gr.Warning("Please accept the Terms & Condition!")
357
- return (
358
- text_hint,
359
- None,
360
- None,
361
- )
362
-
363
- language_predicted = langid.classify(prompt)[0].strip()
364
- print(f"Detected language:{language_predicted}")
365
-
366
- if language_predicted not in supported_languages:
367
- text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
368
- gr.Warning(f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}")
369
 
370
- return (
371
- text_hint,
372
- None,
373
- )
 
374
 
375
- if language_predicted == "zh":
376
- tts_model = zh_tts_model
377
- if zh_tts_model is None:
378
- gr.Warning("TTS model for Chinece language was not loaded please set 'enable_chinese_lang=True`")
379
- return (
380
- text_hint,
381
- None,
382
- )
383
- source_se = zh_source_se
384
- language = "Chinese"
385
- if style not in ["default"]:
386
- text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
387
- gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
388
- return (
389
- text_hint,
390
- None,
391
- )
392
- else:
393
- tts_model = en_tts_model
394
- if style == "default":
395
- source_se = en_source_default_se
396
- else:
397
- source_se = en_source_style_se
398
- language = "English"
399
- supported_styles = [
400
- "default",
401
- "whispering",
402
- "shouting",
403
- "excited",
404
- "cheerful",
405
- "terrified",
406
- "angry",
407
- "sad",
408
- "friendly",
409
- ]
410
- if style not in supported_styles:
411
- text_hint += f"[ERROR] The style {style} is not supported for English, which should be in {*supported_styles,}\n"
412
- gr.Warning(f"The style {style} is not supported for English, which should be in {*supported_styles,}")
413
- return (
414
- text_hint,
415
- None,
416
- )
417
-
418
- speaker_wav = audio_file_pth
419
-
420
- if len(prompt) < 2:
421
- text_hint += "[ERROR] Please give a longer prompt text \n"
422
- gr.Warning("Please give a longer prompt text")
423
- return (
424
- text_hint,
425
- None,
426
- )
427
- if len(prompt) > 200:
428
- text_hint += (
429
- "[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
430
- )
431
- gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage")
432
- return (
433
- text_hint,
434
- None,
435
- )
436
 
437
- # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
438
- try:
439
- target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
440
- except Exception as e:
441
- text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
442
- gr.Warning("[ERROR] Get target tone color error {str(e)} \n")
443
- return (
444
- text_hint,
445
- None,
446
- )
447
 
448
- src_path = f"{output_dir}/tmp.wav"
449
- tts_model.tts(prompt, src_path, speaker=style, language=language)
450
-
451
- save_path = f"{output_dir}/output.wav"
452
- encode_message = "@MyShell"
453
- tone_color_converter.convert(
454
- audio_src_path=src_path,
455
- src_se=source_se,
456
- tgt_se=target_se,
457
- output_path=save_path,
458
- message=encode_message,
459
- )
460
-
461
- text_hint += "Get response successfully \n"
462
-
463
- return (
464
- text_hint,
465
- src_path,
466
- save_path,
467
- )
468
-
469
-
470
- description = """
471
- # OpenVoice accelerated by OpenVINO:
472
-
473
- a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
474
- """
475
-
476
- content = """
477
- <div>
478
- <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
479
- This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
480
- </div>
481
- """
482
- wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
483
-
484
-
485
- examples = [
486
- [
487
- "今天天气真好,我们一起出去吃饭吧。",
488
- "default",
489
- "OpenVoice/resources/demo_speaker1.mp3",
490
- True,
491
- ],
492
- [
493
- "This audio is generated by open voice with a half-performance model.",
494
- "whispering",
495
- "OpenVoice/resources/demo_speaker2.mp3",
496
- True,
497
- ],
498
- [
499
- "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
500
- "sad",
501
- "OpenVoice/resources/demo_speaker0.mp3",
502
- True,
503
- ],
504
- ]
505
 
 
 
506
 
507
- def get_demo(
508
- output_dir,
509
- tone_color_converter,
510
- en_tts_model,
511
- zh_tts_model,
512
- en_source_default_se,
513
- en_source_style_se,
514
- zh_source_se,
515
- ):
516
- with gr.Blocks(analytics_enabled=False) as demo:
517
- with gr.Row():
518
- gr.Markdown(description)
519
- with gr.Row():
520
- gr.HTML(wrapped_markdown_content)
521
-
522
- with gr.Row():
523
- with gr.Column():
524
- input_text_gr = gr.Textbox(
525
- label="Text Prompt",
526
- info="One or two sentences at a time is better. Up to 200 text characters.",
527
- value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
528
- )
529
- style_gr = gr.Dropdown(
530
- label="Style",
531
- info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
532
- choices=[
533
- "default",
534
- "whispering",
535
- "cheerful",
536
- "terrified",
537
- "angry",
538
- "sad",
539
- "friendly",
540
- ],
541
- max_choices=1,
542
- value="default",
543
- )
544
- ref_gr = gr.Audio(
545
- label="Reference Audio",
546
- type="filepath",
547
- value="OpenVoice/resources/demo_speaker2.mp3",
548
- )
549
- tos_gr = gr.Checkbox(
550
- label="Agree",
551
- value=False,
552
- info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
553
- )
554
-
555
- tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
556
-
557
- with gr.Column():
558
- out_text_gr = gr.Text(label="Info")
559
- audio_orig_gr = gr.Audio(label="Synthesised Audio", autoplay=False)
560
- audio_gr = gr.Audio(label="Audio with cloned voice", autoplay=True)
561
- # ref_audio_gr = gr.Audio(label="Reference Audio Used")
562
- predict = build_predict(
563
- output_dir,
564
- tone_color_converter,
565
- en_tts_model,
566
- zh_tts_model,
567
- en_source_default_se,
568
- en_source_style_se,
569
- zh_source_se,
570
- )
571
-
572
- gr.Examples(
573
- examples,
574
- label="Examples",
575
- inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
576
- outputs=[out_text_gr, audio_gr],
577
- fn=predict,
578
- cache_examples=False,
579
- )
580
- tts_button.click(
581
- predict,
582
- [input_text_gr, style_gr, ref_gr, tos_gr],
583
- outputs=[out_text_gr, audio_orig_gr, audio_gr],
584
- )
585
- return demo
586
-
587
- demo = get_demo(
588
  OUTPUT_DIR,
589
  tone_color_converter,
590
  en_base_speaker_tts,
@@ -592,13 +255,26 @@ demo = get_demo(
592
  en_source_default_se,
593
  en_source_style_se,
594
  zh_source_se,
 
595
  )
596
- demo.queue(max_size=2)
597
-
598
- try:
599
- demo.launch(debug=True, height=1000)
600
- except Exception:
601
- demo.launch(share=True, debug=True, height=1000)
602
- # if you are launching remotely, specify server_name and server_port
603
- # demo.launch(server_name='your server name', server_port='server port in int')
604
- # Read more in the docs: https://gradio.app/docs/
 
 
1
  import sys
2
  from pathlib import Path
3
+ import os
4
+ import torch
5
+ import openvino as ov
6
+ import gradio as gr
7
+ import langid
8
+ import ipywidgets as widgets
9
+ from IPython.display import Audio
10
+ from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
11
+ import openvoice.se_extractor as se_extractor
12
+ import nncf
13
 
14
+ # Clone the repo and set up the environment
15
  repo_dir = Path("OpenVoice")
 
16
  if not repo_dir.exists():
17
  !git clone https://github.com/myshell-ai/OpenVoice
18
  orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
 
25
  data = data.replace("unidecode", "anyascii")
26
  with english_path.open("w") as out_f:
27
  out_f.write(data)
 
28
  sys.path.append(str(repo_dir))
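The hunk above elides the lines that actually load the module text into data; judging from the visible context, the step is a plain read-modify-write of OpenVoice's english.py, roughly like the sketch below (inferred from the surrounding lines, not the committed code):

english_path = Path("OpenVoice/openvoice/text/english.py")
# keep an untouched copy of the module the first time around
if not orig_english_path.exists():
    orig_english_path.write_text(english_path.read_text())
data = orig_english_path.read_text()
# then, as the context lines show, swap unidecode for anyascii and write english.py back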
29
 
30
+ # Install the required packages
31
  %pip install -q "librosa>=0.8.1" "wavmark>=0.0.3" "faster-whisper>=0.9.0" "pydub>=0.25.1" "whisper-timestamped>=1.14.2" "tqdm" "inflect>=7.0.0" "eng_to_ipa>=0.0.2" "pypinyin>=0.50.0" \
32
  "cn2an>=0.5.22" "jieba>=0.42.1" "langid>=1.1.6" "gradio>=4.15" "ipywebrtc" "anyascii" "openvino>=2023.3" "torch>=2.1" "nncf>=2.11.0"
33
 
34
  core = ov.Core()
35
 
 
 
 
36
  CKPT_BASE_PATH = "checkpoints"
37
 
38
  en_suffix = f"{CKPT_BASE_PATH}/base_speakers/EN"
 
43
 
44
  def download_from_hf_hub(filename, local_dir="./"):
45
  from huggingface_hub import hf_hub_download
 
46
  os.makedirs(local_dir, exist_ok=True)
47
  hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)
48
 
 
49
  download_from_hf_hub(f"{converter_suffix}/checkpoint.pth")
50
  download_from_hf_hub(f"{converter_suffix}/config.json")
51
  download_from_hf_hub(f"{en_suffix}/checkpoint.pth")
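Since hf_hub_download keeps the repository sub-path when local_dir is given, these calls drop the files exactly where the loaders further down expect them; for example, with the default local_dir="./":

# after download_from_hf_hub(f"{en_suffix}/checkpoint.pth") has run:
assert Path(f"{en_suffix}/checkpoint.pth").exists()   # i.e. ./checkpoints/base_speakers/EN/checkpoint.pth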
 
67
  tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
68
  tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")
69
 
 
70
  if enable_chinese_lang:
71
  zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
72
  zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
 
74
  zh_base_speaker_tts = None
75
 
76
  class OVOpenVoiceBase(torch.nn.Module):
 
 
 
 
77
  def __init__(self, voice_model: OpenVoiceBaseClass):
78
  super().__init__()
79
  self.voice_model = voice_model
80
  for par in voice_model.model.parameters():
81
  par.requires_grad = False
82
 
 
83
  class OVOpenVoiceTTS(OVOpenVoiceBase):
 
 
 
 
84
  def get_example_input(self):
85
  stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False)
86
  x_tst = stn_tst.unsqueeze(0)
 
101
  def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
102
  return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)
103
 
 
104
  class OVOpenVoiceConverter(OVOpenVoiceBase):
 
 
 
 
105
  def get_example_input(self):
106
  y = torch.randn([1, 513, 238], dtype=torch.float32)
107
  y_lengths = torch.LongTensor([y.size(-1)])
 
113
  def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
114
  return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)
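The PyTorch-to-OpenVINO conversion itself is not visible in these hunks; a minimal sketch of how wrappers like the ones above are typically turned into the IR files listed next, assuming ov.convert_model with each wrapper's get_example_input() (the helper name convert_and_compress is illustrative, not from the commit):

def convert_and_compress(wrapper: torch.nn.Module, ir_path: str) -> ov.Model:
    # Trace the wrapper with its example inputs, shrink the weights with NNCF,
    # and cache the IR on disk so later runs only need to read it back.
    if not Path(ir_path).exists():
        ov_model = ov.convert_model(wrapper, example_input=wrapper.get_example_input())
        ov_model = nncf.compress_weights(ov_model)
        ov.save_model(ov_model, ir_path)
    return core.read_model(ir_path)

# e.g. ov_en_tts = convert_and_compress(OVOpenVoiceTTS(en_base_speaker_tts), EN_TTS_IR)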
115
 
 
 
 
116
  IRS_PATH = "openvino_irs/"
117
  EN_TTS_IR = f"{IRS_PATH}/openvoice_en_tts.xml"
118
  ZH_TTS_IR = f"{IRS_PATH}/openvoice_zh_tts.xml"
 
148
  "load_manually",
149
  ]
150
 
 
151
  def save_audio(voice_source: widgets.FileUpload, out_path: str):
152
  with open(out_path, "wb") as output_file:
153
  assert len(voice_source.value) > 0, "Please select audio file"
154
  output_file.write(voice_source.value[0]["content"])
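One caveat: save_audio is written against an ipywidgets FileUpload (it indexes value[0]["content"]), while the Gradio interface at the bottom of the file hands predict a plain uploaded file; a path-based variant would be needed on that route, for instance (illustrative, not part of the commit):

import shutil

def save_audio_from_path(src_path: str, out_path: str):
    # Copy an uploaded audio file that is referenced by path into the working directory.
    shutil.copy(src_path, out_path)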
155
 
 
 
 
 
 
156
  en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth")
157
  en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth")
158
  zh_source_se = torch.load(f"{zh_suffix}/zh_default_se.pth") if enable_chinese_lang else None
 
168
 
169
  return infer_impl
170
 
 
171
  def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
172
  compiled_model = core.compile_model(ov_model, device)
173
 
 
177
 
178
  return voice_conversion_impl
179
 
 
180
  en_base_speaker_tts.model.infer = get_pathched_infer(ov_en_tts, device.value)
181
  tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
182
  if enable_chinese_lang:
183
  zh_base_speaker_tts.model.infer = get_pathched_infer(ov_zh_tts, device.value)
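Only the return statements of get_pathched_infer and get_patched_voice_conversion survive in the hunks above (the spelling follows the identifiers used in the code). A sketch of what such a patch helper can look like, with the argument names assumed from OVOpenVoiceTTS.forward and the torch re-wrapping purely illustrative:

def get_pathched_infer(ov_model: ov.Model, device: str) -> callable:
    compiled_model = core.compile_model(ov_model, device)

    def infer_impl(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
        # Run the compiled IR and hand the audio back as a torch tensor so the
        # surrounding OpenVoice code keeps working unchanged.
        ov_output = compiled_model((x, x_lengths, sid, noise_scale, length_scale, noise_scale_w))
        return (torch.tensor(ov_output[0]),)

    return infer_impl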
184
 
 
185
  supported_languages = ["zh", "en"]
186
 
 
187
  def build_predict(
188
  output_dir,
189
  tone_color_converter,
 
192
  en_source_default_se,
193
  en_source_style_se,
194
  zh_source_se,
195
+ supported_languages,
196
  ):
197
+ def predict(
198
+ input_text,
199
+ reference_audio,
200
+ speaker,
201
+ noise_scale=0.667,
202
+ length_scale=1.0,
203
+ noise_scale_w=0.8,
204
+ tone_color=False,
205
+ ):
206
+ if reference_audio:
207
+ ref_audio_path = f"{output_dir}/input_audio.wav"
208
+ save_audio(reference_audio, ref_audio_path)
209
+ target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
210
+ else:
211
+ if speaker == "record_manually":
212
+ raise ValueError("Manual recording is not implemented in this example.")
213
+ elif speaker == "load_manually":
214
+ raise ValueError("Loading a manual audio file is not implemented in this example.")
215
+ else:
216
+ ref_audio_path = f"{REFERENCE_VOICES_PATH}/{speaker}"
217
+ target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
218
+
219
+ lang = langid.classify(input_text)[0]
220
+ if lang not in supported_languages:
221
+ return f"Unsupported language: {lang}"
222
+
223
+ tts_model = en_tts_model if lang == "en" else zh_tts_model
224
+
225
+ stn_tst = tts_model.get_text(input_text, tts_model.hps, False)
226
+ x_tst = stn_tst.unsqueeze(0)
227
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
228
+ speaker_id = torch.LongTensor([1])
229
+ noise_scale = torch.tensor(noise_scale)
230
+ length_scale = torch.tensor(length_scale)
231
+ noise_scale_w = torch.tensor(noise_scale_w)
 
232
 
233
+ with torch.no_grad():
234
+ audio = tts_model.model.infer(x_tst, x_tst_lengths, speaker_id, noise_scale, length_scale, noise_scale_w)[0]
235
+ if tone_color:
236
+ source_se = en_source_style_se if lang == "en" else zh_source_se
237
+ audio = tone_color_converter.model.voice_conversion(audio, x_tst_lengths, source_se, target_se, torch.tensor(0.3))[0]
238
 
239
+ audio = audio.squeeze().cpu().numpy()
240
+ output_path = f"{output_dir}/output_audio.wav"
241
+ Audio(audio, rate=tts_model.hps.data.sampling_rate).save(output_path)
 
242
 
243
+ return output_path
 
244
 
245
+ return predict
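Two details in predict above deserve a flag: IPython.display.Audio objects have no save() method, and OVOpenVoiceConverter.forward expects a linear spectrogram (see its get_example_input), not the raw waveform returned by infer. If the numpy route is kept, the waveform can be written with soundfile instead, e.g. sf.write(output_path, audio, tts_model.hps.data.sampling_rate). A hedged alternative that stays closer to the notebook flow removed above lets BaseSpeakerTTS.tts write the intermediate WAV itself and runs ToneColorConverter.convert on file paths (the helper name synthesize_and_clone is illustrative):

def synthesize_and_clone(tts_model, prompt, style, language, source_se, target_se, output_dir):
    # Synthesize with the base speaker first, then clone the tone colour file-to-file.
    src_path = f"{output_dir}/tmp.wav"
    tts_model.tts(prompt, src_path, speaker=style, language=language)
    out_path = f"{output_dir}/output_with_cloned_voice_tone.wav"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=out_path,
        message="@MyShell",
    )
    return out_path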
 
246
 
247
+ OUTPUT_DIR = "output_audio"
248
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
249
 
250
+ predict_fn = build_predict(
 
251
  OUTPUT_DIR,
252
  tone_color_converter,
253
  en_base_speaker_tts,
 
255
  en_source_default_se,
256
  en_source_style_se,
257
  zh_source_se,
258
+ supported_languages,
259
  )
260
+
261
+ def gradio_interface():
262
+ input_text = gr.inputs.Textbox(lines=2, placeholder="Enter text here...")
263
+ reference_audio = gr.inputs.Audio(source="upload", type="file", label="Reference Audio")
264
+ speaker = gr.inputs.Dropdown(choices=reference_speakers, default="record_manually", label="Select Speaker")
265
+ noise_scale = gr.inputs.Slider(minimum=0.1, maximum=1.0, default=0.667, label="Noise Scale")
266
+ length_scale = gr.inputs.Slider(minimum=0.1, maximum=2.0, default=1.0, label="Length Scale")
267
+ noise_scale_w = gr.inputs.Slider(minimum=0.1, maximum=1.0, default=0.8, label="Noise Scale W")
268
+ tone_color = gr.inputs.Checkbox(default=False, label="Enable Tone Color Conversion")
269
+
270
+ gr.Interface(
271
+ fn=predict_fn,
272
+ inputs=[input_text, reference_audio, speaker, noise_scale, length_scale, noise_scale_w, tone_color],
273
+ outputs=gr.outputs.Audio(type="file", label="Generated Audio"),
274
+ title="Speech Generation and Tone Conversion",
275
+ description="Generate speech and convert tone using the OpenVoice model.",
276
+ ).launch()
277
+
278
+ gradio_interface()
279
+
280
+
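A final caveat on the Gradio wiring: the requirements pin gradio>=4.15, but the gr.inputs / gr.outputs namespaces and the source= / default= keyword arguments were removed in Gradio 4. A sketch of the same interface against the 4.x component API (an assumed port, not the committed version; with type="filepath" the reference audio reaches predict as a path string, so it can go straight to se_extractor.get_se):

def gradio_interface():
    demo = gr.Interface(
        fn=predict_fn,
        inputs=[
            gr.Textbox(lines=2, placeholder="Enter text here...", label="Text"),
            gr.Audio(sources=["upload"], type="filepath", label="Reference Audio"),
            gr.Dropdown(choices=reference_speakers, value="record_manually", label="Select Speaker"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.667, label="Noise Scale"),
            gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Length Scale"),
            gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Noise Scale W"),
            gr.Checkbox(value=False, label="Enable Tone Color Conversion"),
        ],
        outputs=gr.Audio(type="filepath", label="Generated Audio"),
        title="Speech Generation and Tone Conversion",
        description="Generate speech and convert tone using the OpenVoice model.",
    )
    demo.launch()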