import os
import subprocess
import sys
from pathlib import Path

import gradio as gr
import ipywidgets as widgets
import langid
import nncf
import openvino as ov
import torch
from IPython.display import Audio, display

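# Fetch the OpenVoice sources and apply a small compatibility patch: the upstream text
# frontend imports `unidecode`, which is swapped for `anyascii` below.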
repo_dir = Path("OpenVoice")

if not repo_dir.exists():
    subprocess.run(["git", "clone", "https://github.com/myshell-ai/OpenVoice"], check=True)

orig_english_path = Path("OpenVoice/openvoice/text/_orig_english.py")
english_path = Path("OpenVoice/openvoice/text/english.py")

# Keep a copy of the original file and apply the patch only once, so the script can be re-run.
if not orig_english_path.exists():
    english_path.rename(orig_english_path)

    with orig_english_path.open("r") as f:
        data = f.read()
    data = data.replace("unidecode", "anyascii")
    with english_path.open("w") as out_f:
        out_f.write(data)

sys.path.append(str(repo_dir))

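# OpenVoice's own modules pull in the audio/text packages listed below, so install them
# before importing anything from `openvoice`.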
packages = [
    "librosa>=0.8.1",
    "wavmark>=0.0.3",
    "faster-whisper>=0.9.0",
    "pydub>=0.25.1",
    "whisper-timestamped>=1.14.2",
    "tqdm",
    "inflect>=7.0.0",
    "eng_to_ipa>=0.0.2",
    "pypinyin>=0.50.0",
    "ipywidgets",
]

subprocess.run(["pip", "install"] + packages, check=True)

from openvoice.api import BaseSpeakerTTS, ToneColorConverter, OpenVoiceBaseClass
import openvoice.se_extractor as se_extractor

# soundfile ships as a librosa dependency and is used below to write generated audio.
import soundfile as sf

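# An OpenVINO Core for model compilation later, plus the on-disk layout of the OpenVoice
# checkpoints downloaded from the myshell-ai/OpenVoice repository on the Hugging Face Hub.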
core = ov.Core()

CKPT_BASE_PATH = "checkpoints"

en_suffix = f"{CKPT_BASE_PATH}/base_speakers/EN"
zh_suffix = f"{CKPT_BASE_PATH}/base_speakers/ZH"
converter_suffix = f"{CKPT_BASE_PATH}/converter"

enable_chinese_lang = False

def download_from_hf_hub(filename, local_dir="./"):
    from huggingface_hub import hf_hub_download

    os.makedirs(local_dir, exist_ok=True)
    hf_hub_download(repo_id="myshell-ai/OpenVoice", filename=filename, local_dir=local_dir)


download_from_hf_hub(f"{converter_suffix}/checkpoint.pth")
download_from_hf_hub(f"{converter_suffix}/config.json")

download_from_hf_hub(f"{en_suffix}/checkpoint.pth")
download_from_hf_hub(f"{en_suffix}/config.json")
download_from_hf_hub(f"{en_suffix}/en_default_se.pth")
download_from_hf_hub(f"{en_suffix}/en_style_se.pth")

if enable_chinese_lang:
    download_from_hf_hub(f"{zh_suffix}/checkpoint.pth")
    download_from_hf_hub(f"{zh_suffix}/config.json")
    download_from_hf_hub(f"{zh_suffix}/zh_default_se.pth")

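# Instantiate the original PyTorch models on CPU: the English base speaker TTS and the
# tone color converter (plus the Chinese base speaker when enabled).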
pt_device = "cpu"

en_base_speaker_tts = BaseSpeakerTTS(f"{en_suffix}/config.json", device=pt_device)
en_base_speaker_tts.load_ckpt(f"{en_suffix}/checkpoint.pth")

tone_color_converter = ToneColorConverter(f"{converter_suffix}/config.json", device=pt_device)
tone_color_converter.load_ckpt(f"{converter_suffix}/checkpoint.pth")

if enable_chinese_lang:
    zh_base_speaker_tts = BaseSpeakerTTS(f"{zh_suffix}/config.json", device=pt_device)
    zh_base_speaker_tts.load_ckpt(f"{zh_suffix}/checkpoint.pth")
else:
    zh_base_speaker_tts = None

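# Thin torch.nn.Module wrappers around the OpenVoice models. Each wrapper freezes the
# weights and exposes get_example_input() so that ov.convert_model can trace the forward pass.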
class OVOpenVoiceBase(torch.nn.Module):
    def __init__(self, voice_model: OpenVoiceBaseClass):
        super().__init__()
        self.voice_model = voice_model
        for par in voice_model.model.parameters():
            par.requires_grad = False


class OVOpenVoiceTTS(OVOpenVoiceBase):
    def get_example_input(self):
        stn_tst = self.voice_model.get_text("this is original text", self.voice_model.hps, False)
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        speaker_id = torch.LongTensor([1])
        noise_scale = torch.tensor(0.667)
        length_scale = torch.tensor(1.0)
        noise_scale_w = torch.tensor(0.6)
        return (
            x_tst,
            x_tst_lengths,
            speaker_id,
            noise_scale,
            length_scale,
            noise_scale_w,
        )

    def forward(self, x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
        return self.voice_model.model.infer(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w)


class OVOpenVoiceConverter(OVOpenVoiceBase):
    def get_example_input(self):
        y = torch.randn([1, 513, 238], dtype=torch.float32)
        y_lengths = torch.LongTensor([y.size(-1)])
        target_se = torch.randn(1, 256, 1)
        source_se = torch.randn(1, 256, 1)
        tau = torch.tensor(0.3)
        return (y, y_lengths, source_se, target_se, tau)

    def forward(self, y, y_lengths, sid_src, sid_tgt, tau):
        return self.voice_model.model.voice_conversion(y, y_lengths, sid_src, sid_tgt, tau)

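# Convert each wrapper to OpenVINO IR on the first run (with weights compressed by
# nncf.compress_weights); afterwards the cached IR files are simply read back from disk.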
IRS_PATH = "openvino_irs/"
EN_TTS_IR = f"{IRS_PATH}/openvoice_en_tts.xml"
ZH_TTS_IR = f"{IRS_PATH}/openvoice_zh_tts.xml"
VOICE_CONVERTER_IR = f"{IRS_PATH}/openvoice_tone_conversion.xml"

paths = [EN_TTS_IR, VOICE_CONVERTER_IR]
models = [
    OVOpenVoiceTTS(en_base_speaker_tts),
    OVOpenVoiceConverter(tone_color_converter),
]
if enable_chinese_lang:
    models.append(OVOpenVoiceTTS(zh_base_speaker_tts))
    paths.append(ZH_TTS_IR)

ov_models = []
for model, path in zip(models, paths):
    if not os.path.exists(path):
        ov_model = ov.convert_model(model, example_input=model.get_example_input())
        ov_model = nncf.compress_weights(ov_model)
        ov.save_model(ov_model, path)
    else:
        ov_model = core.read_model(path)
    ov_models.append(ov_model)

ov_en_tts, ov_voice_conversion = ov_models[:2]
if enable_chinese_lang:
    ov_zh_tts = ov_models[-1]

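# Pick the reference voice whose tone color will be cloned: one of the sample .mp3 files
# shipped with the repository, a manual recording, or a manually uploaded file.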
REFERENCE_VOICES_PATH = f"{repo_dir}/resources/"
reference_speakers = [
    *[path for path in os.listdir(REFERENCE_VOICES_PATH) if os.path.splitext(path)[-1] == ".mp3"],
    "record_manually",
    "load_manually",
]

ref_speaker = widgets.Dropdown(
    options=reference_speakers,
    value=reference_speakers[0],
    description="reference voice from which tone color will be copied",
    disabled=False,
)

ref_speaker

OUTPUT_DIR = "outputs/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

ref_speaker_path = f"{REFERENCE_VOICES_PATH}/{ref_speaker.value}"
allowed_audio_types = ".mp4,.mp3,.wav,.wma,.aac,.m4a,.m4b,.webm"

if ref_speaker.value == "record_manually":
    ref_speaker_path = f"{OUTPUT_DIR}/custom_example_sample.webm"
    from ipywebrtc import AudioRecorder, CameraStream

    camera = CameraStream(constraints={"audio": True, "video": False})
    recorder = AudioRecorder(stream=camera, filename=ref_speaker_path, autosave=True)
    display(recorder)

elif ref_speaker.value == "load_manually":
    upload_ref = widgets.FileUpload(
        accept=allowed_audio_types,
        multiple=False,
        description="Select audio with reference voice",
    )
    display(upload_ref)

def save_audio(voice_source: widgets.FileUpload, out_path: str):
    assert len(voice_source.value) > 0, "Please select an audio file"
    with open(out_path, "wb") as output_file:
        output_file.write(voice_source.value[0]["content"])

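# NOTE: for the "load_manually" option the uploaded bytes still have to be written to disk
# before a speaker embedding can be extracted from them. The guard below is a sketch of that
# step; it assumes the ipywidgets 8 FileUpload value format (the same one save_audio uses).
if ref_speaker.value == "load_manually":
    ref_speaker_path = f"{OUTPUT_DIR}/{upload_ref.value[0]['name']}"
    save_audio(upload_ref, ref_speaker_path)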
en_source_default_se = torch.load(f"{en_suffix}/en_default_se.pth")
en_source_style_se = torch.load(f"{en_suffix}/en_style_se.pth")
zh_source_se = torch.load(f"{zh_suffix}/zh_default_se.pth") if enable_chinese_lang else None

target_se, audio_name = se_extractor.get_se(ref_speaker_path, tone_color_converter, target_dir=OUTPUT_DIR, vad=True)

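# The factories below compile an OpenVINO model for the selected device and return a
# drop-in replacement for the original torch `infer`/`voice_conversion` methods, so the
# rest of the OpenVoice pipeline can stay unchanged.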
def get_patched_infer(ov_model: ov.Model, device: str) -> callable:
    compiled_model = core.compile_model(ov_model, device)

    def infer_impl(x, x_lengths, sid, noise_scale, length_scale, noise_scale_w):
        ov_output = compiled_model((x, x_lengths, sid, noise_scale, length_scale, noise_scale_w))
        return (torch.tensor(ov_output[0]),)

    return infer_impl


def get_patched_voice_conversion(ov_model: ov.Model, device: str) -> callable:
    compiled_model = core.compile_model(ov_model, device)

    def voice_conversion_impl(y, y_lengths, sid_src, sid_tgt, tau):
        ov_output = compiled_model((y, y_lengths, sid_src, sid_tgt, tau))
        return (torch.tensor(ov_output[0]),)

    return voice_conversion_impl

device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value="AUTO",
    description="Device:",
    disabled=False,
)
device

en_base_speaker_tts.model.infer = get_patched_infer(ov_en_tts, device.value)
tone_color_converter.model.voice_conversion = get_patched_voice_conversion(ov_voice_conversion, device.value)
if enable_chinese_lang:
    zh_base_speaker_tts.model.infer = get_patched_infer(ov_zh_tts, device.value)

supported_languages = ["zh", "en"]

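# build_predict returns the closure that Gradio calls: it extracts a target speaker
# embedding, synthesizes speech with the (OpenVINO-backed) base speaker TTS, and
# optionally re-colors the result with the tone color converter.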
def build_predict(
    output_dir,
    tone_color_converter,
    en_tts_model,
    zh_tts_model,
    en_source_default_se,
    en_source_style_se,
    zh_source_se,
    supported_languages,
):
    def predict(
        input_text,
        reference_audio,
        speaker,
        noise_scale=0.667,
        length_scale=1.0,
        noise_scale_w=0.8,
        tone_color=False,
    ):
        if reference_audio:
            # gr.Audio(type="filepath") passes the uploaded reference as a file path,
            # so it can be handed to the speaker-embedding extractor directly.
            target_se, _ = se_extractor.get_se(reference_audio, tone_color_converter, target_dir=output_dir, vad=True)
        else:
            if speaker == "record_manually":
                raise ValueError("Manual recording is not implemented in this example.")
            elif speaker == "load_manually":
                raise ValueError("Loading a manual audio file is not implemented in this example.")
            else:
                ref_audio_path = f"{REFERENCE_VOICES_PATH}/{speaker}"
                target_se, _ = se_extractor.get_se(ref_audio_path, tone_color_converter, target_dir=output_dir, vad=True)

        lang = langid.classify(input_text)[0]
        if lang not in supported_languages:
            return f"Unsupported language: {lang}"

        tts_model = en_tts_model if lang == "en" else zh_tts_model
        if tts_model is None:
            return "Chinese TTS model is not loaded; set enable_chinese_lang = True."

        stn_tst = tts_model.get_text(input_text, tts_model.hps, False)
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        speaker_id = torch.LongTensor([1])
        noise_scale = torch.tensor(noise_scale)
        length_scale = torch.tensor(length_scale)
        noise_scale_w = torch.tensor(noise_scale_w)

        with torch.no_grad():
            audio = tts_model.model.infer(x_tst, x_tst_lengths, speaker_id, noise_scale, length_scale, noise_scale_w)[0]

        audio = audio.squeeze().cpu().numpy()
        output_path = f"{output_dir}/output_audio.wav"
        # IPython.display.Audio has no save() method; write the waveform with soundfile.
        sf.write(output_path, audio, tts_model.hps.data.sampling_rate)

        if tone_color:
            # The tone color converter works on audio files: it builds the source
            # spectrogram internally and runs the patched voice_conversion model on it.
            source_se = en_source_style_se if lang == "en" else zh_source_se
            converted_path = f"{output_dir}/output_audio_converted.wav"
            tone_color_converter.convert(
                audio_src_path=output_path,
                src_se=source_se,
                tgt_se=target_se,
                output_path=converted_path,
                tau=0.3,
            )
            output_path = converted_path

        return output_path

    return predict

OUTPUT_DIR = "output_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)

predict_fn = build_predict(
    OUTPUT_DIR,
    tone_color_converter,
    en_base_speaker_tts,
    zh_base_speaker_tts,
    en_source_default_se,
    en_source_style_se,
    zh_source_se,
    supported_languages,
)

def gradio_interface():
    input_text = gr.Textbox(lines=2, placeholder="Enter text here...")
    reference_audio = gr.Audio(type="filepath", label="Reference Audio")
    # Default to one of the bundled reference voices so the demo works without an upload.
    speaker = gr.Dropdown(choices=reference_speakers, value=reference_speakers[0], label="Select Speaker")
    noise_scale = gr.Slider(minimum=0.1, maximum=1.0, value=0.667, label="Noise Scale")
    length_scale = gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Length Scale")
    noise_scale_w = gr.Slider(minimum=0.1, maximum=1.0, value=0.8, label="Noise Scale W")
    tone_color = gr.Checkbox(value=False, label="Enable Tone Color Conversion")

    gr.Interface(
        fn=predict_fn,
        inputs=[input_text, reference_audio, speaker, noise_scale, length_scale, noise_scale_w, tone_color],
        outputs=gr.Audio(type="filepath", label="Generated Audio"),
        title="Speech Generation and Tone Conversion",
        description="Generate speech and convert tone using the OpenVoice model.",
    ).launch()

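# A minimal entry point, assuming this file is executed as a script rather than cell by
# cell; gradio_interface() is otherwise never invoked above.
if __name__ == "__main__":
    gradio_interface()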