Spaces:
Runtime error
Runtime error
Commit
·
54e4063
1
Parent(s):
d8bb336
update
Browse files- app.py +126 -176
- app_locally.py +0 -213
app.py
CHANGED
@@ -1,76 +1,55 @@
|
|
1 |
import os
|
|
|
|
|
2 |
import gradio as gr
|
3 |
-
import
|
4 |
-
import
|
5 |
-
import base64
|
6 |
-
import json
|
7 |
-
import time
|
8 |
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
-
output_dir = 'outputs'
|
13 |
-
os.makedirs(output_dir, exist_ok=True)
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
audio_data = audio_file.read()
|
18 |
-
base64_data = base64.b64encode(audio_data).decode("utf-8")
|
19 |
-
return base64_data
|
20 |
|
21 |
-
|
22 |
-
# initialize a empty info
|
23 |
-
text_hint = ''
|
24 |
-
# agree with the terms
|
25 |
-
if agree == False:
|
26 |
-
text_hint += '[ERROR] Please accept the Terms & Condition!\n'
|
27 |
-
gr.Warning("Please accept the Terms & Condition!")
|
28 |
-
return (
|
29 |
-
text_hint,
|
30 |
-
None,
|
31 |
-
None,
|
32 |
-
)
|
33 |
|
34 |
-
# first detect the input language
|
35 |
-
language_predicted = langid.classify(prompt)[0].strip()
|
36 |
-
print(f"Detected language:{language_predicted}")
|
37 |
|
|
|
|
|
|
|
|
|
38 |
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
)
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
None,
|
48 |
-
None,
|
49 |
-
)
|
50 |
|
51 |
-
if
|
52 |
-
if
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
None,
|
59 |
)
|
60 |
-
style = 'en_' + style
|
61 |
-
|
62 |
-
else:
|
63 |
-
if style not in ['default']:
|
64 |
-
text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
|
65 |
-
gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
|
66 |
return (
|
67 |
text_hint,
|
68 |
None,
|
69 |
None,
|
70 |
)
|
71 |
-
style = 'cn_' + style
|
72 |
|
73 |
-
|
|
|
74 |
|
75 |
if len(prompt) < 2:
|
76 |
text_hint += f"[ERROR] Please give a longer prompt text \n"
|
@@ -80,10 +59,15 @@ def predict(prompt, style, audio_file_pth, agree):
|
|
80 |
None,
|
81 |
None,
|
82 |
)
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
85 |
gr.Warning(
|
86 |
-
"
|
87 |
)
|
88 |
return (
|
89 |
text_hint,
|
@@ -91,46 +75,30 @@ def predict(prompt, style, audio_file_pth, agree):
|
|
91 |
None,
|
92 |
)
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
save_path = f'{output_dir}/output.wav'
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
# Send the data as a POST request
|
104 |
-
response = requests.post(API_URL, json=data, timeout=60)
|
105 |
-
print(f'Get response successfully within {time.time() - start}')
|
106 |
|
107 |
-
# Check the response
|
108 |
-
if response.status_code == 200:
|
109 |
-
try:
|
110 |
-
json_data = json.loads(response.content)
|
111 |
-
text_hint += f"[ERROR] {json_data['error']} \n"
|
112 |
-
gr.Warning(
|
113 |
-
f"[ERROR] {json_data['error']} \n"
|
114 |
-
)
|
115 |
-
return (
|
116 |
-
text_hint,
|
117 |
-
None,
|
118 |
-
None,
|
119 |
-
)
|
120 |
-
except:
|
121 |
-
with open(save_path, 'wb') as f:
|
122 |
-
f.write(response.content)
|
123 |
-
else:
|
124 |
-
text_hint += f"[HTTP ERROR] {response.status_code} - {response.text} \n"
|
125 |
-
gr.Warning(
|
126 |
-
f"[HTTP ERROR] {response.status_code} - {response.text} \n"
|
127 |
-
)
|
128 |
-
return (
|
129 |
-
text_hint,
|
130 |
-
None,
|
131 |
-
None,
|
132 |
-
)
|
133 |
text_hint += f'''Get response successfully \n'''
|
|
|
134 |
return (
|
135 |
text_hint,
|
136 |
save_path,
|
@@ -138,84 +106,53 @@ def predict(prompt, style, audio_file_pth, agree):
|
|
138 |
)
|
139 |
|
140 |
|
141 |
-
title = "MyShell OpenVoice"
|
142 |
-
|
143 |
-
description = """
|
144 |
-
We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
|
145 |
-
"""
|
146 |
-
|
147 |
-
markdown_table = """
|
148 |
-
<div align="center" style="margin-bottom: 10px;">
|
149 |
-
|
150 |
-
| | | |
|
151 |
-
| :-----------: | :-----------: | :-----------: |
|
152 |
-
| **OpenSource Repo** | **Project Page** | **Join the Community** |
|
153 |
-
| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [](https://discord.gg/myshell) |
|
154 |
-
|
155 |
-
</div>
|
156 |
-
"""
|
157 |
-
|
158 |
-
markdown_table_v2 = """
|
159 |
-
<div align="center" style="margin-bottom: 2px;">
|
160 |
-
|
161 |
-
| | | | |
|
162 |
-
| :-----------: | :-----------: | :-----------: | :-----------: |
|
163 |
-
| **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
|
164 |
-
|
165 |
-
| | |
|
166 |
-
| :-----------: | :-----------: |
|
167 |
-
**Join the Community** | [](https://discord.gg/myshell) |
|
168 |
-
|
169 |
-
</div>
|
170 |
-
"""
|
171 |
-
content = """
|
172 |
-
<div>
|
173 |
-
<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
|
174 |
-
This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
|
175 |
-
</div>
|
176 |
-
"""
|
177 |
-
wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
|
178 |
-
|
179 |
-
|
180 |
examples = [
|
181 |
[
|
182 |
"今天天气真好,我们一起出去吃饭吧。",
|
183 |
-
'default',
|
184 |
-
"examples/
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
"This audio is generated by open voice with a half-performance model.",
|
188 |
-
'whispering',
|
189 |
-
"examples/
|
190 |
-
|
|
|
|
|
191 |
],
|
192 |
[
|
193 |
"He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
194 |
-
'sad',
|
195 |
-
"examples/
|
196 |
-
|
|
|
|
|
197 |
],
|
198 |
]
|
199 |
|
200 |
with gr.Blocks(analytics_enabled=False) as demo:
|
201 |
|
202 |
-
with gr.Row():
|
203 |
-
|
204 |
-
with gr.Row():
|
205 |
-
gr.Markdown(
|
206 |
-
"""
|
207 |
-
## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
|
208 |
-
"""
|
209 |
-
)
|
210 |
-
with gr.Row():
|
211 |
-
gr.Markdown(markdown_table_v2)
|
212 |
-
with gr.Row():
|
213 |
-
gr.Markdown(description)
|
214 |
-
with gr.Column():
|
215 |
-
gr.Video('./open_voice.mp4', autoplay=True)
|
216 |
-
|
217 |
-
with gr.Row():
|
218 |
-
gr.HTML(wrapped_markdown_content)
|
219 |
|
220 |
with gr.Row():
|
221 |
with gr.Column():
|
@@ -224,24 +161,35 @@ with gr.Blocks(analytics_enabled=False) as demo:
|
|
224 |
info="One or two sentences at a time is better. Up to 200 text characters.",
|
225 |
value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
226 |
)
|
227 |
-
style_gr = gr.Dropdown(
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
)
|
234 |
ref_gr = gr.Audio(
|
235 |
label="Reference Audio",
|
236 |
info="Click on the ✎ button to upload your own target speaker audio",
|
237 |
type="filepath",
|
238 |
-
value="examples/
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
)
|
240 |
-
|
241 |
-
label="
|
242 |
value=False,
|
243 |
-
info="
|
244 |
)
|
|
|
|
|
|
|
|
|
|
|
245 |
|
246 |
tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
|
247 |
|
@@ -253,11 +201,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
|
|
253 |
|
254 |
gr.Examples(examples,
|
255 |
label="Examples",
|
256 |
-
inputs=[input_text_gr, style_gr, ref_gr,
|
|
|
257 |
outputs=[out_text_gr, audio_gr, ref_audio_gr],
|
258 |
fn=predict,
|
259 |
cache_examples=False,)
|
260 |
-
tts_button.click(predict, [input_text_gr, style_gr, ref_gr,
|
|
|
261 |
|
262 |
-
demo.queue(
|
263 |
-
demo.launch(debug=True, show_api=True)
|
|
|
1 |
import os
|
2 |
+
import torch
|
3 |
+
import argparse
|
4 |
import gradio as gr
|
5 |
+
#from zipfile import ZipFile
|
6 |
+
from melo.api import TTS
|
|
|
|
|
|
|
7 |
|
8 |
+
# Init EN/ZH baseTTS and ToneConvertor
|
9 |
+
from OpenVoice import se_extractor
|
10 |
+
from OpenVoice.api import ToneColorConverter
|
11 |
+
import devicetorch
|
12 |
+
device = devicetorch.get(torch)
|
13 |
+
ckpt_converter = 'checkpoints/converter'
|
14 |
+
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
|
15 |
+
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
|
16 |
|
|
|
|
|
17 |
|
18 |
+
languages = ["EN", "ES", "FR", "ZH", "JP", "KR"]
|
19 |
+
en = ["EN-Default", "EN-US", "EN-BR", "EN_INDIA", "EN-AU"]
|
|
|
|
|
|
|
20 |
|
21 |
+
LANG = sys.argv[1].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
|
|
|
|
|
|
23 |
|
24 |
+
#def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, language):
|
25 |
+
def predict(prompt, audio_file_pth, mic_file_path, use_mic, language):
|
26 |
+
# initialize a empty info
|
27 |
+
text_hint = ''
|
28 |
|
29 |
+
lang_code = language
|
30 |
+
if language.startswith("EN"):
|
31 |
+
lang_code = "EN"
|
32 |
+
tts_model = TTS(language=lang_code, device=device)
|
|
|
33 |
|
34 |
+
speaker_key = language.lower().replace('_', '-')
|
35 |
+
source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)
|
|
|
|
|
|
|
36 |
|
37 |
+
if use_mic == True:
|
38 |
+
if mic_file_path is not None:
|
39 |
+
speaker_wav = mic_file_path
|
40 |
+
else:
|
41 |
+
text_hint += f"[ERROR] Please record your voice with Microphone, or uncheck Use Microphone to use reference audios\n"
|
42 |
+
gr.Warning(
|
43 |
+
"Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
|
|
|
44 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
return (
|
46 |
text_hint,
|
47 |
None,
|
48 |
None,
|
49 |
)
|
|
|
50 |
|
51 |
+
else:
|
52 |
+
speaker_wav = audio_file_pth
|
53 |
|
54 |
if len(prompt) < 2:
|
55 |
text_hint += f"[ERROR] Please give a longer prompt text \n"
|
|
|
59 |
None,
|
60 |
None,
|
61 |
)
|
62 |
+
|
63 |
+
# note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
|
64 |
+
try:
|
65 |
+
target_se, wavs_folder = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', max_length=60., vad=True)
|
66 |
+
# os.system(f'rm -rf {wavs_folder}')
|
67 |
+
except Exception as e:
|
68 |
+
text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
|
69 |
gr.Warning(
|
70 |
+
"[ERROR] Get target tone color error {str(e)} \n"
|
71 |
)
|
72 |
return (
|
73 |
text_hint,
|
|
|
75 |
None,
|
76 |
)
|
77 |
|
78 |
+
output_dir = os.path.abspath("output")
|
79 |
+
src_path = f'{output_dir}/tmp.wav'
|
80 |
+
|
81 |
+
speed = 1.0
|
82 |
+
|
83 |
+
#tts_model.tts_to_file(prompt, speaker_id, src_path, speaker=style, language=language)
|
84 |
+
speaker_ids = tts_model.hps.data.spk2id
|
85 |
+
print(f"Speaker_ids= {speaker_ids}, language={language}, speaker_key={speaker_key}")
|
86 |
+
speaker_id = speaker_ids[language]
|
87 |
+
|
88 |
+
tts_model.tts_to_file(prompt, speaker_id, src_path)
|
89 |
+
|
90 |
save_path = f'{output_dir}/output.wav'
|
91 |
+
# Run the tone color converter
|
92 |
+
encode_message = "@MyShell"
|
93 |
+
tone_color_converter.convert(
|
94 |
+
audio_src_path=src_path,
|
95 |
+
src_se=source_se,
|
96 |
+
tgt_se=target_se,
|
97 |
+
output_path=save_path,
|
98 |
+
message=encode_message)
|
|
|
|
|
|
|
99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
text_hint += f'''Get response successfully \n'''
|
101 |
+
|
102 |
return (
|
103 |
text_hint,
|
104 |
save_path,
|
|
|
106 |
)
|
107 |
|
108 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
examples = [
|
110 |
[
|
111 |
"今天天气真好,我们一起出去吃饭吧。",
|
112 |
+
# 'default',
|
113 |
+
"examples/speaker0.mp3",
|
114 |
+
None,
|
115 |
+
False,
|
116 |
+
"ZH",
|
117 |
+
],
|
118 |
+
[
|
119 |
+
"お前はもう死んでいる",
|
120 |
+
# 'default',
|
121 |
+
"examples/speaker0.mp3",
|
122 |
+
None,
|
123 |
+
False,
|
124 |
+
"JP",
|
125 |
+
],
|
126 |
+
[
|
127 |
+
"오빤 강남 스타일",
|
128 |
+
# 'default',
|
129 |
+
"examples/speaker0.mp3",
|
130 |
+
None,
|
131 |
+
False,
|
132 |
+
"KR",
|
133 |
+
],
|
134 |
+
[
|
135 |
"This audio is generated by open voice with a half-performance model.",
|
136 |
+
# 'whispering',
|
137 |
+
"examples/speaker1.mp3",
|
138 |
+
None,
|
139 |
+
False,
|
140 |
+
"EN-BR"
|
141 |
],
|
142 |
[
|
143 |
"He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
144 |
+
# 'sad',
|
145 |
+
"examples/speaker2.mp3",
|
146 |
+
None,
|
147 |
+
False,
|
148 |
+
"EN-BR"
|
149 |
],
|
150 |
]
|
151 |
|
152 |
with gr.Blocks(analytics_enabled=False) as demo:
|
153 |
|
154 |
+
# with gr.Row():
|
155 |
+
# gr.HTML(wrapped_markdown_content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
with gr.Row():
|
158 |
with gr.Column():
|
|
|
161 |
info="One or two sentences at a time is better. Up to 200 text characters.",
|
162 |
value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
163 |
)
|
164 |
+
#style_gr = gr.Dropdown(
|
165 |
+
# label="Style",
|
166 |
+
# info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
|
167 |
+
# choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
|
168 |
+
# max_choices=1,
|
169 |
+
# value="default",
|
170 |
+
#)
|
171 |
ref_gr = gr.Audio(
|
172 |
label="Reference Audio",
|
173 |
info="Click on the ✎ button to upload your own target speaker audio",
|
174 |
type="filepath",
|
175 |
+
value="examples/speaker0.mp3",
|
176 |
+
)
|
177 |
+
mic_gr = gr.Audio(
|
178 |
+
source="microphone",
|
179 |
+
type="filepath",
|
180 |
+
info="Use your microphone to record audio",
|
181 |
+
label="Use Microphone for Reference",
|
182 |
)
|
183 |
+
use_mic_gr = gr.Checkbox(
|
184 |
+
label="Use Microphone",
|
185 |
value=False,
|
186 |
+
info="Notice: Microphone input may not work properly under traffic",
|
187 |
)
|
188 |
+
#language = gr.Radio(['EN-Newest', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN-Newest')
|
189 |
+
if LANG == "EN":
|
190 |
+
language = gr.Radio(['EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default'], label='Language', value='EN-US')
|
191 |
+
else:
|
192 |
+
language = gr.Radio([LANG], value=LANG)
|
193 |
|
194 |
tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
|
195 |
|
|
|
201 |
|
202 |
gr.Examples(examples,
|
203 |
label="Examples",
|
204 |
+
#inputs=[input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language],
|
205 |
+
inputs=[input_text_gr, ref_gr, mic_gr, use_mic_gr, language],
|
206 |
outputs=[out_text_gr, audio_gr, ref_audio_gr],
|
207 |
fn=predict,
|
208 |
cache_examples=False,)
|
209 |
+
#tts_button.click(predict, [input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
|
210 |
+
tts_button.click(predict, [input_text_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
|
211 |
|
212 |
+
demo.queue()
|
213 |
+
demo.launch(debug=True, show_api=True)
|
app_locally.py
DELETED
@@ -1,213 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import torch
|
3 |
-
import argparse
|
4 |
-
import gradio as gr
|
5 |
-
#from zipfile import ZipFile
|
6 |
-
from melo.api import TTS
|
7 |
-
|
8 |
-
# Init EN/ZH baseTTS and ToneConvertor
|
9 |
-
from OpenVoice import se_extractor
|
10 |
-
from OpenVoice.api import ToneColorConverter
|
11 |
-
import devicetorch
|
12 |
-
device = devicetorch.get(torch)
|
13 |
-
ckpt_converter = 'checkpoints/converter'
|
14 |
-
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
|
15 |
-
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
|
16 |
-
|
17 |
-
|
18 |
-
languages = ["EN", "ES", "FR", "ZH", "JP", "KR"]
|
19 |
-
en = ["EN-Default", "EN-US", "EN-BR", "EN_INDIA", "EN-AU"]
|
20 |
-
|
21 |
-
LANG = sys.argv[1].strip()
|
22 |
-
|
23 |
-
|
24 |
-
#def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, language):
|
25 |
-
def predict(prompt, audio_file_pth, mic_file_path, use_mic, language):
|
26 |
-
# initialize a empty info
|
27 |
-
text_hint = ''
|
28 |
-
|
29 |
-
lang_code = language
|
30 |
-
if language.startswith("EN"):
|
31 |
-
lang_code = "EN"
|
32 |
-
tts_model = TTS(language=lang_code, device=device)
|
33 |
-
|
34 |
-
speaker_key = language.lower().replace('_', '-')
|
35 |
-
source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)
|
36 |
-
|
37 |
-
if use_mic == True:
|
38 |
-
if mic_file_path is not None:
|
39 |
-
speaker_wav = mic_file_path
|
40 |
-
else:
|
41 |
-
text_hint += f"[ERROR] Please record your voice with Microphone, or uncheck Use Microphone to use reference audios\n"
|
42 |
-
gr.Warning(
|
43 |
-
"Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
|
44 |
-
)
|
45 |
-
return (
|
46 |
-
text_hint,
|
47 |
-
None,
|
48 |
-
None,
|
49 |
-
)
|
50 |
-
|
51 |
-
else:
|
52 |
-
speaker_wav = audio_file_pth
|
53 |
-
|
54 |
-
if len(prompt) < 2:
|
55 |
-
text_hint += f"[ERROR] Please give a longer prompt text \n"
|
56 |
-
gr.Warning("Please give a longer prompt text")
|
57 |
-
return (
|
58 |
-
text_hint,
|
59 |
-
None,
|
60 |
-
None,
|
61 |
-
)
|
62 |
-
|
63 |
-
# note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
|
64 |
-
try:
|
65 |
-
target_se, wavs_folder = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', max_length=60., vad=True)
|
66 |
-
# os.system(f'rm -rf {wavs_folder}')
|
67 |
-
except Exception as e:
|
68 |
-
text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
|
69 |
-
gr.Warning(
|
70 |
-
"[ERROR] Get target tone color error {str(e)} \n"
|
71 |
-
)
|
72 |
-
return (
|
73 |
-
text_hint,
|
74 |
-
None,
|
75 |
-
None,
|
76 |
-
)
|
77 |
-
|
78 |
-
output_dir = os.path.abspath("output")
|
79 |
-
src_path = f'{output_dir}/tmp.wav'
|
80 |
-
|
81 |
-
speed = 1.0
|
82 |
-
|
83 |
-
#tts_model.tts_to_file(prompt, speaker_id, src_path, speaker=style, language=language)
|
84 |
-
speaker_ids = tts_model.hps.data.spk2id
|
85 |
-
print(f"Speaker_ids= {speaker_ids}, language={language}, speaker_key={speaker_key}")
|
86 |
-
speaker_id = speaker_ids[language]
|
87 |
-
|
88 |
-
tts_model.tts_to_file(prompt, speaker_id, src_path)
|
89 |
-
|
90 |
-
save_path = f'{output_dir}/output.wav'
|
91 |
-
# Run the tone color converter
|
92 |
-
encode_message = "@MyShell"
|
93 |
-
tone_color_converter.convert(
|
94 |
-
audio_src_path=src_path,
|
95 |
-
src_se=source_se,
|
96 |
-
tgt_se=target_se,
|
97 |
-
output_path=save_path,
|
98 |
-
message=encode_message)
|
99 |
-
|
100 |
-
text_hint += f'''Get response successfully \n'''
|
101 |
-
|
102 |
-
return (
|
103 |
-
text_hint,
|
104 |
-
save_path,
|
105 |
-
speaker_wav,
|
106 |
-
)
|
107 |
-
|
108 |
-
|
109 |
-
examples = [
|
110 |
-
[
|
111 |
-
"今天天气真好,我们一起出去吃饭吧。",
|
112 |
-
# 'default',
|
113 |
-
"examples/speaker0.mp3",
|
114 |
-
None,
|
115 |
-
False,
|
116 |
-
"ZH",
|
117 |
-
],
|
118 |
-
[
|
119 |
-
"お前はもう死んでいる",
|
120 |
-
# 'default',
|
121 |
-
"examples/speaker0.mp3",
|
122 |
-
None,
|
123 |
-
False,
|
124 |
-
"JP",
|
125 |
-
],
|
126 |
-
[
|
127 |
-
"오빤 강남 스타일",
|
128 |
-
# 'default',
|
129 |
-
"examples/speaker0.mp3",
|
130 |
-
None,
|
131 |
-
False,
|
132 |
-
"KR",
|
133 |
-
],
|
134 |
-
[
|
135 |
-
"This audio is generated by open voice with a half-performance model.",
|
136 |
-
# 'whispering',
|
137 |
-
"examples/speaker1.mp3",
|
138 |
-
None,
|
139 |
-
False,
|
140 |
-
"EN-BR"
|
141 |
-
],
|
142 |
-
[
|
143 |
-
"He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
144 |
-
# 'sad',
|
145 |
-
"examples/speaker2.mp3",
|
146 |
-
None,
|
147 |
-
False,
|
148 |
-
"EN-BR"
|
149 |
-
],
|
150 |
-
]
|
151 |
-
|
152 |
-
with gr.Blocks(analytics_enabled=False) as demo:
|
153 |
-
|
154 |
-
# with gr.Row():
|
155 |
-
# gr.HTML(wrapped_markdown_content)
|
156 |
-
|
157 |
-
with gr.Row():
|
158 |
-
with gr.Column():
|
159 |
-
input_text_gr = gr.Textbox(
|
160 |
-
label="Text Prompt",
|
161 |
-
info="One or two sentences at a time is better. Up to 200 text characters.",
|
162 |
-
value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
163 |
-
)
|
164 |
-
#style_gr = gr.Dropdown(
|
165 |
-
# label="Style",
|
166 |
-
# info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
|
167 |
-
# choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
|
168 |
-
# max_choices=1,
|
169 |
-
# value="default",
|
170 |
-
#)
|
171 |
-
ref_gr = gr.Audio(
|
172 |
-
label="Reference Audio",
|
173 |
-
info="Click on the ✎ button to upload your own target speaker audio",
|
174 |
-
type="filepath",
|
175 |
-
value="examples/speaker0.mp3",
|
176 |
-
)
|
177 |
-
mic_gr = gr.Audio(
|
178 |
-
source="microphone",
|
179 |
-
type="filepath",
|
180 |
-
info="Use your microphone to record audio",
|
181 |
-
label="Use Microphone for Reference",
|
182 |
-
)
|
183 |
-
use_mic_gr = gr.Checkbox(
|
184 |
-
label="Use Microphone",
|
185 |
-
value=False,
|
186 |
-
info="Notice: Microphone input may not work properly under traffic",
|
187 |
-
)
|
188 |
-
#language = gr.Radio(['EN-Newest', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN-Newest')
|
189 |
-
if LANG == "EN":
|
190 |
-
language = gr.Radio(['EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default'], label='Language', value='EN-US')
|
191 |
-
else:
|
192 |
-
language = gr.Radio([LANG], value=LANG)
|
193 |
-
|
194 |
-
tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
|
195 |
-
|
196 |
-
|
197 |
-
with gr.Column():
|
198 |
-
out_text_gr = gr.Text(label="Info")
|
199 |
-
audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
|
200 |
-
ref_audio_gr = gr.Audio(label="Reference Audio Used")
|
201 |
-
|
202 |
-
gr.Examples(examples,
|
203 |
-
label="Examples",
|
204 |
-
#inputs=[input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language],
|
205 |
-
inputs=[input_text_gr, ref_gr, mic_gr, use_mic_gr, language],
|
206 |
-
outputs=[out_text_gr, audio_gr, ref_audio_gr],
|
207 |
-
fn=predict,
|
208 |
-
cache_examples=False,)
|
209 |
-
#tts_button.click(predict, [input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
|
210 |
-
tts_button.click(predict, [input_text_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
|
211 |
-
|
212 |
-
demo.queue()
|
213 |
-
demo.launch(debug=True, show_api=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|