Update app.py
app.py
CHANGED
@@ -1,8 +1,18 @@
 import sys
 from pathlib import Path
+import os
+import torch
+import openvino as ov
+import gradio as gr
+import langid
+import ipywidgets as widgets
+from IPython.display import Audio
+from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
+import openvoice.se_extractor as se_extractor
+import nncf
 
+# Clone the repo and set up the environment
 repo_dir = Path("OpenVoice")
-
 if not repo_dir.exists():
     !git clone https://github.com/myshell-ai/OpenVoice
 orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
@@ -15,23 +25,14 @@
     data = data.replace("unidecode", "anyascii")
     with english_path.open("w") as out_f:
         out_f.write(data)
-# append to sys.path so that modules from the repo could be imported
 sys.path.append(str(repo_dir))
 
+# Install the required packages
 %pip install -q "librosa>=0.8.1" "wavmark>=0.0.3" "faster-whisper>=0.9.0" "pydub>=0.25.1" "whisper-timestamped>=1.14.2" "tqdm" "inflect>=7.0.0" "eng_to_ipa>=0.0.2" "pypinyin>=0.50.0" \
     "cn2an>=0.5.22" "jieba>=0.42.1" "langid>=1.1.6" "gradio>=4.15" "ipywebrtc" "anyascii" "openvino>=2023.3" "torch>=2.1" "nncf>=2.11.0"
 
-import os
-import torch
-import openvino as ov
-import ipywidgets as widgets
-from IPython.display import Audio
-
 core = ov.Core()
 
-from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
-import openvoice.se_extractor as se_extractor
-
 CKPT_BASE_PATH = "checkpoints"
 
 en_suffix = f"{CKPT_BASE_PATH}/base_speakers/EN"
@@ -42,11 +43,9 @@ enable_chinese_lang = False
 
 def download_from_hf_hub(filename, local_dir="./"):
     from huggingface_hub import hf_hub_download
-
     os.makedirs(local_dir, exist_ok=True)
     hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)
 
-
 download_from_hf_hub(f"{converter_suffix}/checkpoint.pth")
 download_from_hf_hub(f"{converter_suffix}/config.json")
 download_from_hf_hub(f"{en_suffix}/checkpoint.pth")
@@ -68,20 +67,6 @@ en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")
 tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
 tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")
 
-if enable_chinese_lang:
-    zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
-    zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
-else:
-    zh_base_speaker_tts = None
-
-pt_device = "cpu"
-
-en_base_speaker_tts = BaseSpeakerTTS(f"{en_suffix}/config.json", device=pt_device)
-en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")
-
-tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
-tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")
-
 if enable_chinese_lang:
     zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
     zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
@@ -89,22 +74,13 @@ else:
     zh_base_speaker_tts = None
 
 class OVOpenVoiceBase(torch.nn.Module):
-    """
-    Base class for both TTS and voice tone conversion model: constructor is same for both of them.
-    """
-
     def __init__(self, voice_model: OpenVoiceBaseClass):
         super().__init__()
         self.voice_model = voice_model
         for par in voice_model.model.parameters():
             par.requires_grad = False
 
-
 class OVOpenVoiceTTS(OVOpenVoiceBase):
-    """
-    Constructor of this class accepts BaseSpeakerTTS object for speech generation and wraps it's 'infer' method with forward.
-    """
-
     def get_example_input(self):
         stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False)
         x_tst = stn_tst.unsqueeze(0)
@@ -125,12 +101,7 @@ class OVOpenVoiceTTS(OVOpenVoiceBase):
     def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
         return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)
 
-
 class OVOpenVoiceConverter(OVOpenVoiceBase):
-    """
-    Constructor of this class accepts ToneColorConverter object for voice tone conversion and wraps it's 'voice_conversion' method with forward.
-    """
-
     def get_example_input(self):
         y = torch.randn([1, 513, 238], dtype=torch.float32)
         y_lengths = torch.LongTensor([y.size(-1)])
@@ -142,9 +113,6 @@ class OVOpenVoiceConverter(OVOpenVoiceBase):
     def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
         return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)
 
-import nncf
-
-
 IRS_PATH = "openvino_irs/"
 EN_TTS_IR = f"{IRS_PATH}/openvoice_en_tts.xml"
 ZH_TTS_IR = f"{IRS_PATH}/openvoice_zh_tts.xml"
@@ -180,46 +148,11 @@ reference_speakers = [
     "load_manually",
 ]
 
-ref_speaker = widgets.Dropdown(
-    options=reference_speakers,
-    value=reference_speakers[0],
-    description="reference voice from which tone color will be copied",
-    disabled=False,
-)
-
-ref_speaker
-
-OUTPUT_DIR = "outputs/"
-os.makedirs(OUTPUT_DIR, exist_ok=True)
-
-ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
-allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"
-
-if ref_speaker.value == "record_manually":
-    ref_speaker_path = f"{OUTPUT_DIR}/custom_example_sample.webm"
-    from ipywebrtc import AudioRecorder, CameraStream
-
-    camera = CameraStream(constraints={"audio": True, "video": False})
-    recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
-    display(recorder)
-elif ref_speaker.value == "load_manually":
-    upload_ref = widgets.FileUpload(
-        accept=allowed_audio_types,
-        multiple=False,
-        description="Select audio with reference voice",
-    )
-    display(upload_ref)
-
 def save_audio(voice_source: widgets.FileUpload, out_path: str):
     with open(out_path, "wb") as output_file:
         assert len(voice_source.value) > 0, "Please select audio file"
         output_file.write(voice_source.value[0]["content"])
 
-
-if ref_speaker.value == "load_manually":
-    ref_speaker_path = f"{OUTPUT_DIR}/{upload_ref.value[0].name}"
-    save_audio(upload_ref, ref_speaker_path)
-
 en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth")
 en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth")
 zh_source_se = torch.load(f"{zh_suffix}/zh_default_se.pth") if enable_chinese_lang else None
@@ -235,7 +168,6 @@ def get_pathched_infer(ov_model: ov.Model, device: str) -> callable:
 
     return infer_impl
 
-
 def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
     compiled_model = core.compile_model(ov_model, device)
 
@@ -245,71 +177,13 @@ def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
 
    return voice_conversion_impl
 
-
 en_base_speaker_tts.model.infer = get_pathched_infer(ov_en_tts, device.value)
 tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
 if enable_chinese_lang:
     zh_base_speaker_tts.model.infer = get_pathched_infer(ov_zh_tts, device.value)
 
-voice_source = widgets.Dropdown(
-    options=["use TTS", "choose_manually"],
-    value="use TTS",
-    description="Voice source",
-    disabled=False,
-)
-
-voice_source
-
-if voice_source.value == "choose_manually":
-    upload_orig_voice = widgets.FileUpload(
-        accept=allowed_audio_types,
-        multiple=False,
-        description="audo whose tone will be replaced",
-    )
-    display(upload_orig_voice)
-
-if voice_source.value == "choose_manually":
-    orig_voice_path = f"{OUTPUT_DIR}/{upload_orig_voice.value[0].name}"
-    save_audio(upload_orig_voice, orig_voice_path)
-    source_se, _ = se_extractor.get_se(orig_voice_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
-else:
-    text = """
-    OpenVINO toolkit is a comprehensive toolkit for quickly developing applications and solutions that solve
-    a variety of tasks including emulation of human vision, automatic speech recognition, natural language processing,
-    recommendation systems, and many others.
-    """
-    source_se = en_source_default_se
-    orig_voice_path = f"{OUTPUT_DIR}/tmp.wav"
-    en_base_speaker_tts.tts(text, orig_voice_path, speaker="default", language="English")
-
-tau_slider = widgets.FloatSlider(
-    value=0.3,
-    min=0.01,
-    max=2.0,
-    step=0.01,
-    description="tau",
-    disabled=False,
-    readout_format=".2f",
-)
-tau_slider
-
-resulting_voice_path = f"{OUTPUT_DIR}/output_with_cloned_voice_tone.wav"
-
-tone_color_converter.convert(
-    audio_src_path=orig_voice_path,
-    src_se=source_se,
-    tgt_se=target_se,
-    output_path=resulting_voice_path,
-    tau=tau_slider.value,
-    message="@MyShell",
-)
-
-import gradio as gr
-import langid
-
 supported_languages = ["zh", "en"]
 
-
 def build_predict(
     output_dir,
     tone_color_converter,
@@ -318,273 +192,62 @@ def build_predict(
     en_source_default_se,
     en_source_style_se,
     zh_source_se,
+    supported_languages,
 ):
-    def predict(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
-
-
-
-            return (
-                text_hint,
-                None,
-                None,
-            )
-
-        language_predicted = langid.classify(prompt)[0].strip()
-        print(f"Detected language:{language_predicted}")
-
-        if language_predicted not in supported_languages:
-            text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
-            gr.Warning(f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}")
+    def predict(
+        input_text,
+        reference_audio,
+        speaker,
+        noise_scale=0.667,
+        length_scale=1.0,
+        noise_scale_w=0.8,
+        tone_color=False,
+    ):
+        if reference_audio:
+            ref_audio_path = f"{output_dir}/input_audio.wav"
+            save_audio(reference_audio, ref_audio_path)
+            target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
+        else:
+            if speaker == "record_manually":
+                raise ValueError("Manual recording is not implemented in this example.")
+            elif speaker == "load_manually":
+                raise ValueError("Loading a manual audio file is not implemented in this example.")
+            else:
+                ref_audio_path = f"{REFERENCE_VOICES_PATH}/{speaker}"
+                target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)
+
+        lang = langid.classify(input_text)[0]
+        if lang not in supported_languages:
+            return f"Unsupported language: {lang}"
+
+        tts_model = en_tts_model if lang == "en" else zh_tts_model
+
+        stn_tst = tts_model.get_text(input_text, tts_model.hps, False)
+        x_tst = stn_tst.unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
+        speaker_id = torch.LongTensor([1])
+        noise_scale = torch.tensor(noise_scale)
+        length_scale = torch.tensor(length_scale)
+        noise_scale_w = torch.tensor(noise_scale_w)
 
-
-
-
-
+        with torch.no_grad():
+            audio = tts_model.model.infer(x_tst, x_tst_lengths, speaker_id, noise_scale, length_scale, noise_scale_w)[0]
+            if tone_color:
+                source_se = en_source_style_se if lang == "en" else zh_source_se
+                audio = tone_color_converter.model.voice_conversion(audio, x_tst_lengths, source_se, target_se, torch.tensor(0.3))[0]
 
-
-
-
-                gr.Warning("TTS model for Chinece language was not loaded please set 'enable_chinese_lang=True`")
-                return (
-                    text_hint,
-                    None,
-                )
-        source_se = zh_source_se
-        language = "Chinese"
-        if style not in ["default"]:
-            text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
-            gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
-            return (
-                text_hint,
-                None,
-            )
-    else:
-        tts_model = en_tts_model
-        if style == "default":
-            source_se = en_source_default_se
-        else:
-            source_se = en_source_style_se
-        language = "English"
-        supported_styles = [
-            "default",
-            "whispering",
-            "shouting",
-            "excited",
-            "cheerful",
-            "terrified",
-            "angry",
-            "sad",
-            "friendly",
-        ]
-        if style not in supported_styles:
-            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in {*supported_styles,}\n"
-            gr.Warning(f"The style {style} is not supported for English, which should be in {*supported_styles,}")
-            return (
-                text_hint,
-                None,
-            )
-
-        speaker_wav = audio_file_pth
-
-        if len(prompt) < 2:
-            text_hint += "[ERROR] Please give a longer prompt text \n"
-            gr.Warning("Please give a longer prompt text")
-            return (
-                text_hint,
-                None,
-            )
-        if len(prompt) > 200:
-            text_hint += (
-                "[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
-            )
-            gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage")
-            return (
-                text_hint,
-                None,
-            )
+        audio = audio.squeeze().cpu().numpy()
+        output_path = f"{output_dir}/output_audio.wav"
+        Audio(audio, rate=tts_model.hps.data.sampling_rate).save(output_path)
 
-
-        try:
-            target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)
-        except Exception as e:
-            text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
-            gr.Warning("[ERROR] Get target tone color error {str(e)} \n")
-            return (
-                text_hint,
-                None,
-            )
+        return output_path
 
-
-        tts_model.tts(prompt, src_path, speaker=style, language=language)
-
-        save_path = f"{output_dir}/output.wav"
-        encode_message = "@MyShell"
-        tone_color_converter.convert(
-            audio_src_path=src_path,
-            src_se=source_se,
-            tgt_se=target_se,
-            output_path=save_path,
-            message=encode_message,
-        )
-
-        text_hint += "Get response successfully \n"
-
-        return (
-            text_hint,
-            src_path,
-            save_path,
-        )
-
-
-description = """
-# OpenVoice accelerated by OpenVINO:
-
-a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
-"""
-
-content = """
-<div>
-<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
-This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
-</div>
-"""
-wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
-
-
-examples = [
-    [
-        "今天天气真好,我们一起出去吃饭吧。",
-        "default",
-        "OpenVoice/resources/demo_speaker1.mp3",
-        True,
-    ],
-    [
-        "This audio is generated by open voice with a half-performance model.",
-        "whispering",
-        "OpenVoice/resources/demo_speaker2.mp3",
-        True,
-    ],
-    [
-        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-        "sad",
-        "OpenVoice/resources/demo_speaker0.mp3",
-        True,
-    ],
-]
+    return predict
 
+OUTPUT_DIR = "output_audio"
+os.makedirs(OUTPUT_DIR, exist_ok=True)
 
-
-    output_dir,
-    tone_color_converter,
-    en_tts_model,
-    zh_tts_model,
-    en_source_default_se,
-    en_source_style_se,
-    zh_source_se,
-):
-    with gr.Blocks(analytics_enabled=False) as demo:
-        with gr.Row():
-            gr.Markdown(description)
-        with gr.Row():
-            gr.HTML(wrapped_markdown_content)
-
-        with gr.Row():
-            with gr.Column():
-                input_text_gr = gr.Textbox(
-                    label="Text Prompt",
-                    info="One or two sentences at a time is better. Up to 200 text characters.",
-                    value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
-                )
-                style_gr = gr.Dropdown(
-                    label="Style",
-                    info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
-                    choices=[
-                        "default",
-                        "whispering",
-                        "cheerful",
-                        "terrified",
-                        "angry",
-                        "sad",
-                        "friendly",
-                    ],
-                    max_choices=1,
-                    value="default",
-                )
-                ref_gr = gr.Audio(
-                    label="Reference Audio",
-                    type="filepath",
-                    value="OpenVoice/resources/demo_speaker2.mp3",
-                )
-                tos_gr = gr.Checkbox(
-                    label="Agree",
-                    value=False,
-                    info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
-                )
-
-                tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
-
-            with gr.Column():
-                out_text_gr = gr.Text(label="Info")
-                audio_orig_gr = gr.Audio(label="Synthesised Audio", autoplay=False)
-                audio_gr = gr.Audio(label="Audio with cloned voice", autoplay=True)
-                # ref_audio_gr = gr.Audio(label="Reference Audio Used")
-        predict = build_predict(
-            output_dir,
-            tone_color_converter,
-            en_tts_model,
-            zh_tts_model,
-            en_source_default_se,
-            en_source_style_se,
-            zh_source_se,
-        )
-
-        gr.Examples(
-            examples,
-            label="Examples",
-            inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
-            outputs=[out_text_gr, audio_gr],
-            fn=predict,
-            cache_examples=False,
-        )
-        tts_button.click(
-            predict,
-            [input_text_gr, style_gr, ref_gr, tos_gr],
-            outputs=[out_text_gr, audio_orig_gr, audio_gr],
-        )
-    return demo
-
-demo = get_demo(
+predict_fn = build_predict(
     OUTPUT_DIR,
     tone_color_converter,
     en_base_speaker_tts,
@@ -592,13 +255,26 @@ demo = get_demo(
     en_source_default_se,
     en_source_style_se,
     zh_source_se,
+    supported_languages,
 )
-
-
-
-
-
-
-
-
-
+
+def gradio_interface():
+    input_text = gr.inputs.Textbox(lines=2, placeholder="Enter text here...")
+    reference_audio = gr.inputs.Audio(source="upload", type="file", label="Reference Audio")
+    speaker = gr.inputs.Dropdown(choices=reference_speakers, default="record_manually", label="Select Speaker")
+    noise_scale = gr.inputs.Slider(minimum=0.1, maximum=1.0, default=0.667, label="Noise Scale")
+    length_scale = gr.inputs.Slider(minimum=0.1, maximum=2.0, default=1.0, label="Length Scale")
+    noise_scale_w = gr.inputs.Slider(minimum=0.1, maximum=1.0, default=0.8, label="Noise Scale W")
+    tone_color = gr.inputs.Checkbox(default=False, label="Enable Tone Color Conversion")
+
+    gr.Interface(
+        fn=predict_fn,
+        inputs=[input_text, reference_audio, speaker, noise_scale, length_scale, noise_scale_w, tone_color],
+        outputs=gr.outputs.Audio(type="file", label="Generated Audio"),
+        title="Speech Generation and Tone Conversion",
+        description="Generate speech and convert tone using the OpenVoice model.",
+    ).launch()
+
+gradio_interface()
+
+
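Note on the added interface code: it builds its widgets through the legacy gr.inputs / gr.outputs namespaces, while the install cell above pins gradio>=4.15, where those namespaces were removed in favour of top-level components. A minimal sketch of the same wiring with Gradio 4 components is shown below; predict_fn and reference_speakers are the objects defined in the updated app.py above, and the mapping of inputs to the seven predict arguments is assumed to keep the order used in gradio_interface().

import gradio as gr

# Sketch only: the inputs/outputs of gradio_interface() expressed with
# Gradio 4 top-level components instead of gr.inputs / gr.outputs.
demo = gr.Interface(
    fn=predict_fn,  # returned by build_predict(...) above
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter text here...", label="Text"),
        gr.Audio(type="filepath", label="Reference Audio"),
        gr.Dropdown(choices=reference_speakers, value="record_manually", label="Select Speaker"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.667, label="Noise Scale"),
        gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Length Scale"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Noise Scale W"),
        gr.Checkbox(value=False, label="Enable Tone Color Conversion"),
    ],
    outputs=gr.Audio(type="filepath", label="Generated Audio"),
    title="Speech Generation and Tone Conversion",
    description="Generate speech and convert tone using the OpenVoice model.",
)
demo.launch()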