File size: 3,684 Bytes
37835db
 
 
 
 
 
 
 
 
 
 
 
 
9bb2fdd
 
37835db
9bb2fdd
37835db
 
 
 
 
 
 
9bb2fdd
37835db
 
 
 
 
 
 
 
 
 
 
 
9bb2fdd
37835db
 
 
 
 
 
 
9bb2fdd
37835db
 
9bb2fdd
37835db
 
 
 
 
 
 
 
 
 
 
9bb2fdd
 
37835db
9bb2fdd
37835db
 
 
 
 
 
 
 
 
 
 
9bb2fdd
37835db
 
 
 
 
 
 
 
 
 
e9233e6
9bb2fdd
37835db
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import torch
import gradio as gr
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from transformers import pipeline
import scipy
from pathlib import Path

# Directory that receives the converted (cloned) audio files; it is created
# at import time, together with any missing parents, if it does not exist yet.
output_dir = './openvoice_outputs'
Path(output_dir).mkdir(parents=True, exist_ok=True)

def generate_speech(text, model_id):
    """Synthesise ``text`` with a Hugging Face text-to-speech pipeline.

    Args:
        text: The text to convert to speech.
        model_id: Model identifier forwarded as ``model=`` to
            ``pipeline("text-to-speech", ...)``.

    Returns:
        A ``(sampling_rate, audio)`` tuple. ``sampling_rate`` is always
        48000; ``audio`` is the waveform, resampled to that rate when the
        model produced a different one.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` is loaded on every SciPy release.
    import numpy as np
    from scipy import signal

    target_rate = 48000

    # NOTE: the pipeline (and therefore the model) is rebuilt on every call.
    device = 0 if torch.cuda.is_available() else -1
    synthesiser = pipeline("text-to-speech", model=model_id, device=device)
    speech = synthesiser(text)

    source_rate = speech["sampling_rate"]
    waveform = np.asarray(speech["audio"])
    # A 2-D result is reduced to its first row (what the original ``[0]``
    # indexing did); a 1-D result is already a single waveform and is used
    # as-is instead of being indexed down to one sample.
    if waveform.ndim > 1:
        waveform = waveform[0]

    # Resample to 48 kHz if needed.
    if source_rate != target_rate:
        resampled_length = int(len(waveform) * target_rate / source_rate)
        waveform = signal.resample(waveform, resampled_length)

    return target_rate, waveform

def save_audio(sampling_rate, audio_data, filename="output.wav"):
    """Write ``audio_data`` to ``filename`` as a WAV file.

    Args:
        sampling_rate: Sample rate, in samples per second, stored in the
            WAV header.
        audio_data: Array of audio samples to write.
        filename: Destination path. Defaults to ``"output.wav"``, which is
            resolved relative to the current working directory.

    Returns:
        ``filename``, unchanged, so the caller can pass it on directly.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.io.wavfile`` is reachable as an attribute on
    # every SciPy release.
    from scipy.io import wavfile

    wavfile.write(filename, rate=sampling_rate, data=audio_data)
    return filename

def voice_cloning(base_speaker, reference_speaker, model_version, device_choice, vad_select):
    """Convert the tone colour of ``base_speaker`` to match ``reference_speaker``.

    Args:
        base_speaker: Path of the audio file to convert (the source voice).
        reference_speaker: Path of the audio file with the target voice.
        model_version: Name of the sub-directory of ``./OPENVOICE_MODELS``
            that holds ``config.json`` and ``checkpoint.pth``.
        device_choice: ``"GPU"`` to run on ``cuda:0`` when CUDA is
            available; any other value selects the CPU.
        vad_select: Forwarded as ``vad=`` to ``se_extractor.get_se``.

    Returns:
        ``(output_path, status_message)``. On failure ``output_path`` is
        ``None`` and the message starts with ``"Error:"``; exceptions are
        reported through the message rather than raised.
    """
    # Fail early with a readable message when an audio path is missing (for
    # example, cloning was requested but no reference audio was supplied),
    # instead of surfacing an opaque error from the embedding extractor.
    if not base_speaker or not reference_speaker:
        return None, (
            "Error: a base audio file and a reference speaker audio file "
            "are both required for voice cloning."
        )

    try:
        # Determine paths and device
        ckpt_converter = f'./OPENVOICE_MODELS/{model_version}'
        device = "cuda:0" if device_choice == "GPU" and torch.cuda.is_available() else "cpu"
        print(f"Device: {device}")

        # Load the ToneColorConverter
        tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
        tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

        # Extract speaker embeddings
        source_se, _ = se_extractor.get_se(base_speaker, tone_color_converter, vad=vad_select)
        target_se, _ = se_extractor.get_se(reference_speaker, tone_color_converter, vad=vad_select)

        # Define output file paths
        save_path = f'{output_dir}/output_cloned.wav'

        # Perform tone color conversion
        tone_color_converter.convert(
            audio_src_path=base_speaker,
            src_se=source_se,
            tgt_se=target_se,
            output_path=save_path,
        )
        return save_path, "Voice cloning successful!"
    except Exception as e:
        # UI boundary: report the failure in the status field instead of
        # letting the exception escape.
        return None, f"Error: {str(e)}"

def ui_fn(text, model_id, clone, reference_speaker, model_version, device_choice, vad_select):
    """Gradio callback: synthesise speech and optionally clone a voice onto it.

    Args:
        text: Text to synthesise.
        model_id: Text-to-speech model identifier passed to
            ``generate_speech``.
        clone: When truthy, the generated audio is passed through
            ``voice_cloning``.
        reference_speaker: Path of the target-voice audio (used when
            ``clone`` is truthy).
        model_version: Converter model version passed to ``voice_cloning``.
        device_choice: Device selection passed to ``voice_cloning``.
        vad_select: VAD flag passed to ``voice_cloning``.

    Returns:
        ``(audio_path, status_message)``. ``audio_path`` is ``None`` when
        the request could not be fulfilled; the message then starts with
        ``"Error:"``.
    """
    # Reject empty or whitespace-only input before any model is loaded.
    if not text or not text.strip():
        return None, "Error: please enter some text to synthesize."

    # Report generation failures through the status output, the same way
    # voice_cloning reports its own failures.
    try:
        sampling_rate, audio_data = generate_speech(text, model_id)
        audio_file = save_audio(sampling_rate, audio_data)
    except Exception as e:
        return None, f"Error: {str(e)}"

    if not clone:
        return audio_file, "Speech generation successful!"

    cloned_audio_file, status = voice_cloning(
        audio_file, reference_speaker, model_version, device_choice, vad_select
    )
    return cloned_audio_file, status

if __name__ == "__main__":
    # Input components, listed in the positional order of ui_fn's parameters.
    input_components = [
        gr.Textbox(label="Text to Synthesize"),
        gr.Textbox(label="Model ID", value="VIZINTZOR/MMS-TTS-THAI-MALE-NARRATOR"),
        gr.Checkbox(label="Clone Voice", value=False),
        gr.Audio(label="Reference Speaker (Target Voice)", type="filepath"),
        gr.Dropdown(["v1", "v2"], value="v2", label="Model Version"),
        # Preselect the GPU entry only when CUDA is actually available.
        gr.Dropdown(
            ["CPU", "GPU"],
            value="GPU" if torch.cuda.is_available() else "CPU",
            label="Device",
        ),
        gr.Checkbox(value=False, label="VAD", interactive=True),
    ]

    # Output components, matching ui_fn's (audio path, status message) result.
    output_components = [
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Textbox(label="Status", interactive=False),
    ]

    iface = gr.Interface(
        fn=ui_fn,
        inputs=input_components,
        outputs=output_components,
        title="Text-to-Speech Synthesizer with OpenVoice",
        description="Enter text and model ID to generate speech. Optionally, clone the voice using a reference speaker.",
    )
    iface.launch()