File size: 4,143 Bytes
37835db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import torch
import gradio as gr
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from transformers import pipeline
import scipy
from pathlib import Path

# Output directory setup
# Cloned-voice output is written under this directory; it is created at
# import time so later writes do not fail on a missing folder.
output_dir = './openvoice_outputs'
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Function to get model names from a directory
def get_model_names(model_dir):
    """Return the names of the immediate subdirectories of ``model_dir``.

    Every directory entry directly under ``model_dir`` is reported by its
    bare name; non-directory entries are skipped. The order is whatever
    ``Path.glob`` yields.
    """
    names = []
    for entry in Path(model_dir).glob('*'):
        if entry.is_dir():
            names.append(entry.name)
    return names

def generate_speech(text, model_path, target_rate=48000):
    """Synthesize ``text`` with a Hugging Face text-to-speech pipeline.

    Args:
        text: Text to synthesize.
        model_path: Local model directory or Hub model id handed to
            ``transformers.pipeline``.
        target_rate: Sampling rate, in Hz, of the returned audio. Defaults
            to 48000, the rate this function has always produced.

    Returns:
        A ``(sampling_rate, audio)`` tuple. ``audio`` is the first entry of
        the pipeline's ``"audio"`` output, resampled to ``target_rate``
        when the model produced a different rate.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` is loaded on every SciPy version.
    from scipy import signal

    # Device index 0 when CUDA is available, otherwise -1 (CPU).
    device = 0 if torch.cuda.is_available() else -1
    synthesiser = pipeline("text-to-speech", model_path, device=device)
    speech = synthesiser(text)

    audio = speech["audio"][0]
    source_rate = speech["sampling_rate"]
    if source_rate == target_rate:
        return source_rate, audio

    # Scale the sample count by the rate ratio so the duration is preserved.
    resampled_len = int(len(audio) * target_rate / source_rate)
    return target_rate, signal.resample(audio, resampled_len)

def save_audio(sampling_rate, audio_data, filename="output.wav"):
    """Write ``audio_data`` to ``filename`` as a WAV file.

    Args:
        sampling_rate: Sample rate, in Hz, stored in the WAV header.
        audio_data: NumPy array of samples passed to
            ``scipy.io.wavfile.write``.
        filename: Destination path; defaults to ``output.wav`` in the
            current working directory.

    Returns:
        ``filename``, so callers can pass the path straight on.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.io.wavfile`` is loaded on every SciPy version.
    from scipy.io import wavfile

    wavfile.write(filename, rate=sampling_rate, data=audio_data)
    return filename

def voice_cloning(base_speaker, reference_speaker, model_version, device_choice, vad_select):
    """Convert ``base_speaker`` audio to the tone colour of ``reference_speaker``.

    Args:
        base_speaker: Path of the audio file to convert.
        reference_speaker: Path of the audio file providing the target voice.
        model_version: Sub-folder of ``./OPENVOICE_MODELS`` holding
            ``config.json`` and ``checkpoint.pth``.
        device_choice: ``"GPU"`` to request CUDA; anything else selects CPU.
        vad_select: Forwarded as ``vad`` to ``se_extractor.get_se``.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, "Error: ...")`` when any step raises.
    """
    try:
        # Checkpoint folder for the requested converter version.
        ckpt_converter = f'./OPENVOICE_MODELS/{model_version}'

        # CUDA is used only when it was requested and is actually available.
        if device_choice == "GPU" and torch.cuda.is_available():
            device = "cuda:0"
        else:
            device = "cpu"
        print(f"Device: {device}")

        # Build the converter and load its weights.
        converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
        converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

        # Speaker embeddings for the source and the target voice.
        source_se, _ = se_extractor.get_se(base_speaker, converter, vad=vad_select)
        target_se, _ = se_extractor.get_se(reference_speaker, converter, vad=vad_select)

        # The converted audio always goes to the same file in output_dir.
        cloned_path = f'{output_dir}/output_cloned.wav'
        converter.convert(
            audio_src_path=base_speaker,
            src_se=source_se,
            tgt_se=target_se,
            output_path=cloned_path,
        )
    except Exception as exc:
        # Report the failure as a status string instead of raising to the caller.
        return None, f"Error: {exc}"
    else:
        return cloned_path, "Voice cloning successful!"

def ui_fn(text, model_dir, model_name, clone, reference_speaker, model_version, device_choice, vad_select):
    """Synthesize ``text`` and optionally clone it to a reference voice.

    The TTS model location is ``os.path.join(model_dir, model_name)``. The
    synthesized audio is always written with ``save_audio``; when ``clone``
    is truthy that file is then passed through ``voice_cloning``.

    Returns:
        ``(audio_path, status_message)``. With cloning enabled this is
        whatever ``voice_cloning`` reports, so ``audio_path`` can be ``None``
        when cloning fails.
    """
    rate, waveform = generate_speech(text, os.path.join(model_dir, model_name))
    synthesized_path = save_audio(rate, waveform)

    # Plain synthesis: hand back the file as-is.
    if not clone:
        return synthesized_path, "Speech generation successful!"

    cloned_path, status = voice_cloning(
        synthesized_path, reference_speaker, model_version, device_choice, vad_select
    )
    return cloned_path, status

if __name__ == "__main__":
    def _synthesize_from_form(text, model_id, clone, reference_speaker, model_version, device_choice, vad_select):
        """Adapt the seven form fields to ``ui_fn``'s eight parameters.

        The form has a single "Model Path or Id" box rather than a separate
        directory and model name. Passing the fields straight to ``ui_fn``
        would shift every argument by one and leave ``vad_select`` missing.
        An empty ``model_dir`` makes the ``os.path.join`` inside ``ui_fn``
        return ``model_id`` unchanged.
        """
        return ui_fn(text, "", model_id, clone, reference_speaker, model_version, device_choice, vad_select)

    # NOTE: get_model_names() can list local model folders if the free-text
    # model box is ever replaced by a dropdown of locally stored models.
    iface = gr.Interface(
        fn=_synthesize_from_form,
        # The order of the inputs must match _synthesize_from_form's parameters.
        inputs=[
            gr.Textbox(label="Text to Synthesize"),
            gr.Textbox(label="Model Path or Id", value="VIZINTZOR/MMS-TTS-THAI-MALE-NARRATOR"),
            gr.Checkbox(label="Clone Voice", value=False),
            gr.Audio(label="Reference Speaker (Target Voice)", type="filepath"),
            gr.Dropdown(["v1", "v2"], value="v2", label="Model Version"),
            gr.Dropdown(["CPU", "GPU"], value="GPU" if torch.cuda.is_available() else "CPU", label="Device"),
            gr.Checkbox(value=False, label="VAD", interactive=True)
        ],
        # The order of the outputs matches ui_fn's (audio_path, status) return value.
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Textbox(label="Status", interactive=False)
        ],
        title="Text-to-Speech Synthesizer with Voice Cloning",
        description="Enter text and model path to generate speech. Optionally, clone the voice using a reference speaker."
    )
    iface.launch()