File size: 7,716 Bytes
fe29fc3
54e4063
 
fe29fc3
e513ef4
54e4063
 
fe29fc3
54e4063
 
 
 
 
 
 
 
fe29fc3
 
c5cdf3b
 
fe29fc3
54e4063
277aa8c
fe29fc3
 
54e4063
5314d6c
54e4063
 
fe29fc3
a3dedb2
54e4063
b2e0012
c866b21
b0f19c8
54e4063
fe29fc3
54e4063
 
fe29fc3
54e4063
 
 
 
 
 
 
fe29fc3
 
 
 
 
 
 
54e4063
 
fe29fc3
 
 
 
 
 
 
 
 
54e4063
 
 
 
 
 
 
fe29fc3
54e4063
fe29fc3
 
 
 
 
 
 
54e4063
 
 
ca2a828
6465172
54e4063
 
 
 
 
 
5314d6c
54e4063
fe29fc3
54e4063
 
 
 
 
 
 
 
fe29fc3
 
54e4063
fe29fc3
 
 
 
 
 
 
 
 
 
54e4063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe29fc3
54e4063
 
 
 
 
fe29fc3
 
 
54e4063
 
 
 
 
fe29fc3
 
 
 
 
54e4063
 
fe29fc3
 
 
 
 
 
 
 
54e4063
 
 
 
 
 
 
fe29fc3
 
 
 
54e4063
 
 
 
 
 
 
fe29fc3
54e4063
 
fe29fc3
54e4063
fe29fc3
5314d6c
 
 
1d0f153
5314d6c
 
54e4063
976bb78
c5cdf3b
54e4063
f38dc69
fe29fc3
 
 
 
 
 
 
 
 
f38dc69
 
 
 
 
 
 
54e4063
5314d6c
fe29fc3
54e4063
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import os
import torch
import argparse
import gradio as gr
import sys
#from zipfile import ZipFile
from melo.api import TTS

# Init EN/ZH baseTTS and ToneConvertor
from OpenVoice import se_extractor
from OpenVoice.api import ToneColorConverter
import devicetorch
# Pick the torch device for this machine via the devicetorch helper.
device = devicetorch.get(torch)
# Tone-color converter: transplants a target speaker's timbre onto base TTS
# output. Requires the converter checkpoint files under checkpoints/converter.
ckpt_converter = 'checkpoints/converter'
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')


#languages = ["EN_NEWEST", "EN", "ES", "FR", "ZH", "JP", "KR"]
# English accent keys. NOTE(review): `en` appears unused in this file — verify
# before removing.
en = ["EN-Default", "EN-US", "EN-BR", "EN_INDIA", "EN-AU"]

# Language comes from the first CLI argument (e.g. `python app.py EN`);
# this raises IndexError if the argument is omitted.
LANG = sys.argv[1].strip()
print(f"LANG={LANG}")


def predict(prompt, audio_file_pth, mic_file_path, use_mic, language, speed):
    """Synthesize `prompt` with the MeloTTS base speaker for `language`, then
    re-color the audio with the tone of the reference speaker.

    Args:
        prompt: Text to synthesize (must be at least 2 characters).
        audio_file_pth: Path to the uploaded reference speaker audio.
        mic_file_path: Path to a microphone recording (used when `use_mic`).
        use_mic: If truthy, the microphone recording is the reference audio.
        language: Language/accent key (e.g. "EN-US", "ZH") — must match a
            speaker id in the loaded TTS model.
        speed: Speech-speed multiplier forwarded to the TTS model.

    Returns:
        Tuple of (info_text, synthesized_wav_path_or_None,
        reference_wav_path_or_None). The wav paths are None on error.
    """
    # Accumulates user-facing status/error text returned in the first slot.
    text_hint = ''

    print(f"language = {language}")
    # All English accents share the single "EN" MeloTTS model; other
    # languages map 1:1 to a model of the same name.
    lang_code = "EN" if language.startswith("EN") else language
    tts_model = TTS(language=lang_code, device=device)

    # Base-speaker embeddings are stored under lowercased, dash-normalized
    # accent keys (e.g. "en-us.pth").
    speaker_key = language.lower().replace('_', '-')
    source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)

    if use_mic:
        if mic_file_path is None:
            text_hint += "[ERROR] Please record your voice with Microphone, or uncheck Use Microphone to use reference audios\n"
            gr.Warning(
                "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
            )
            return (
                text_hint,
                None,
                None,
            )
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    if len(prompt) < 2:
        text_hint += "[ERROR] Please give a longer prompt text \n"
        gr.Warning("Please give a longer prompt text")
        return (
            text_hint,
            None,
            None,
        )

    try:
        # Extract the target speaker's tone-color embedding from the
        # reference audio (VAD-trimmed, capped at 60 s).
        target_se, wavs_folder = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', max_length=60., vad=True)
    except Exception as e:
        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
        # BUGFIX: the original string was missing the f-prefix, so the user
        # saw the literal text "{str(e)}" instead of the exception message.
        gr.Warning(
            f"[ERROR] Get target tone color error {str(e)} \n"
        )
        return (
            text_hint,
            None,
            None,
        )

    output_dir = os.path.abspath("output")
    # Ensure the output directory exists on a fresh checkout/first run.
    os.makedirs(output_dir, exist_ok=True)
    src_path = f'{output_dir}/tmp.wav'

    print(f"speed = {speed}")

    # Each accent key maps to a speaker id inside the (possibly
    # multi-speaker) model; KeyError here means `language` is not a key of
    # the model's spk2id table.
    speaker_ids = tts_model.hps.data.spk2id
    print(f"Speaker_ids= {speaker_ids}, language={language}, speaker_key={speaker_key}")
    speaker_id = speaker_ids[language]

    tts_model.tts_to_file(prompt, speaker_id, src_path, speed=speed)

    save_path = f'{output_dir}/output.wav'
    # Run the tone color converter; the message is watermarked into the audio.
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path, 
        src_se=source_se, 
        tgt_se=target_se, 
        output_path=save_path,
        message=encode_message)

    text_hint += 'Get response successfully \n'

    return (
        text_hint,
        save_path,
        speaker_wav,
    )


# Demo presets for the UI, in predict()'s input order:
# [text prompt, reference audio path, mic recording (always None),
#  use-mic flag (always False), language key].
_EXAMPLE_ROWS = [
    ("今天天气真好,我们一起出去吃饭吧。", "examples/speaker0.mp3", "ZH"),
    ("お前はもう死んでいる", "examples/speaker0.mp3", "JP"),
    ("오빤 강남 스타일", "examples/speaker0.mp3", "KR"),
    (
        "This audio is generated by open voice with a half-performance model.",
        "examples/speaker1.mp3",
        "EN-BR",
    ),
    (
        "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
        "examples/speaker2.mp3",
        "EN-BR",
    ),
]

# Expand the shared mic fields so each row matches predict()'s parameters.
examples = [
    [text, ref_audio, None, False, lang]
    for text, ref_audio, lang in _EXAMPLE_ROWS
]

# --- Gradio UI: two columns (inputs on the left, results on the right). ---
with gr.Blocks(analytics_enabled=False) as demo:

#    with gr.Row():
#        gr.HTML(wrapped_markdown_content)

    with gr.Row():
        # Left column: text prompt, reference audio, mic input, speed,
        # and language selection.
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
            )
            #style_gr = gr.Dropdown(
            #    label="Style",
            #    info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
            #    choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
            #    max_choices=1,
            #    value="default",
            #)
            # Reference speaker audio used for tone-color extraction.
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/speaker0.mp3",
            )
            # Alternative reference source: a microphone recording,
            # only consulted when the checkbox below is ticked.
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
                info="Use your microphone to record audio",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
                info="Notice: Microphone input may not work properly under traffic",
            )
            # Speech-speed multiplier forwarded to predict().
            speed = gr.Slider(
                label="Speed",
                minimum=0.1,
                maximum=3.0,
                value=1.0,
            )
            #language = gr.Radio(['EN-Newest', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN-Newest')
            # English builds expose an accent picker; any other LANG gets a
            # hidden single-choice radio so predict() still receives a value.
            if LANG.startswith("EN"):
                language = gr.Radio(['EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default'], label='Language', value='EN-Default')
            else:
                language = gr.Radio([LANG], value=LANG, visible=False)

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)


        # Right column: status text, synthesized audio, and the reference
        # audio that was actually used.
        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

#            gr.Examples(examples,
#                        label="Examples",
#                        #inputs=[input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language],
#                        inputs=[input_text_gr, ref_gr, mic_gr, use_mic_gr, language],
#                        outputs=[out_text_gr, audio_gr, ref_audio_gr],
#                        fn=predict,
#                        cache_examples=False,)
            #tts_button.click(predict, [input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
            # Wire the button: inputs must stay in predict()'s parameter order.
            tts_button.click(predict, [input_text_gr, ref_gr, mic_gr, use_mic_gr, language, speed], outputs=[out_text_gr, audio_gr, ref_audio_gr])

# Enable request queuing, then block serving the app (show_api exposes the
# /predict REST endpoint).
demo.queue()  
demo.launch(debug=True, show_api=True)