cocktailpeanut committed on
Commit
54e4063
·
1 Parent(s): d8bb336
Files changed (2) hide show
  1. app.py +126 -176
  2. app_locally.py +0 -213
app.py CHANGED
@@ -1,76 +1,55 @@
1
  import os
 
 
2
  import gradio as gr
3
- import requests
4
- import langid
5
- import base64
6
- import json
7
- import time
8
 
9
- API_URL = os.environ.get("API_URL")
10
- supported_languages = ['zh', 'en']
 
 
 
 
 
 
11
 
12
- output_dir = 'outputs'
13
- os.makedirs(output_dir, exist_ok=True)
14
 
15
- def audio_to_base64(audio_file):
16
- with open(audio_file, "rb") as audio_file:
17
- audio_data = audio_file.read()
18
- base64_data = base64.b64encode(audio_data).decode("utf-8")
19
- return base64_data
20
 
21
- def predict(prompt, style, audio_file_pth, agree):
22
- # initialize a empty info
23
- text_hint = ''
24
- # agree with the terms
25
- if agree == False:
26
- text_hint += '[ERROR] Please accept the Terms & Condition!\n'
27
- gr.Warning("Please accept the Terms & Condition!")
28
- return (
29
- text_hint,
30
- None,
31
- None,
32
- )
33
 
34
- # first detect the input language
35
- language_predicted = langid.classify(prompt)[0].strip()
36
- print(f"Detected language:{language_predicted}")
37
 
 
 
 
 
38
 
39
- if language_predicted not in supported_languages:
40
- text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
41
- gr.Warning(
42
- f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
43
- )
44
 
45
- return (
46
- text_hint,
47
- None,
48
- None,
49
- )
50
 
51
- if language_predicted == "en":
52
- if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
53
- text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
54
- gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
55
- return (
56
- text_hint,
57
- None,
58
- None,
59
  )
60
- style = 'en_' + style
61
-
62
- else:
63
- if style not in ['default']:
64
- text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
65
- gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
66
  return (
67
  text_hint,
68
  None,
69
  None,
70
  )
71
- style = 'cn_' + style
72
 
73
- speaker_wav = audio_file_pth
 
74
 
75
  if len(prompt) < 2:
76
  text_hint += f"[ERROR] Please give a longer prompt text \n"
@@ -80,10 +59,15 @@ def predict(prompt, style, audio_file_pth, agree):
80
  None,
81
  None,
82
  )
83
- if len(prompt) > 200:
84
- text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
 
 
 
 
 
85
  gr.Warning(
86
- "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
87
  )
88
  return (
89
  text_hint,
@@ -91,46 +75,30 @@ def predict(prompt, style, audio_file_pth, agree):
91
  None,
92
  )
93
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  save_path = f'{output_dir}/output.wav'
95
- speaker_audio_base64 = audio_to_base64(speaker_wav)
96
- data = {
97
- "text": prompt,
98
- "reference_speaker": speaker_audio_base64,
99
- "emotion": style
100
- }
101
-
102
- start = time.time()
103
- # Send the data as a POST request
104
- response = requests.post(API_URL, json=data, timeout=60)
105
- print(f'Get response successfully within {time.time() - start}')
106
 
107
- # Check the response
108
- if response.status_code == 200:
109
- try:
110
- json_data = json.loads(response.content)
111
- text_hint += f"[ERROR] {json_data['error']} \n"
112
- gr.Warning(
113
- f"[ERROR] {json_data['error']} \n"
114
- )
115
- return (
116
- text_hint,
117
- None,
118
- None,
119
- )
120
- except:
121
- with open(save_path, 'wb') as f:
122
- f.write(response.content)
123
- else:
124
- text_hint += f"[HTTP ERROR] {response.status_code} - {response.text} \n"
125
- gr.Warning(
126
- f"[HTTP ERROR] {response.status_code} - {response.text} \n"
127
- )
128
- return (
129
- text_hint,
130
- None,
131
- None,
132
- )
133
  text_hint += f'''Get response successfully \n'''
 
134
  return (
135
  text_hint,
136
  save_path,
@@ -138,84 +106,53 @@ def predict(prompt, style, audio_file_pth, agree):
138
  )
139
 
140
 
141
- title = "MyShell OpenVoice"
142
-
143
- description = """
144
- We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
145
- """
146
-
147
- markdown_table = """
148
- <div align="center" style="margin-bottom: 10px;">
149
-
150
- | | | |
151
- | :-----------: | :-----------: | :-----------: |
152
- | **OpenSource Repo** | **Project Page** | **Join the Community** |
153
- | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
154
-
155
- </div>
156
- """
157
-
158
- markdown_table_v2 = """
159
- <div align="center" style="margin-bottom: 2px;">
160
-
161
- | | | | |
162
- | :-----------: | :-----------: | :-----------: | :-----------: |
163
- | **Github Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
164
-
165
- | | |
166
- | :-----------: | :-----------: |
167
- **Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
168
-
169
- </div>
170
- """
171
- content = """
172
- <div>
173
- <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
174
- This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
175
- </div>
176
- """
177
- wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
178
-
179
-
180
  examples = [
181
  [
182
  "今天天气真好,我们一起出去吃饭吧。",
183
- 'default',
184
- "examples/speaker1.mp3",
185
- True,
186
- ],[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  "This audio is generated by open voice with a half-performance model.",
188
- 'whispering',
189
- "examples/speaker2.mp3",
190
- True,
 
 
191
  ],
192
  [
193
  "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
194
- 'sad',
195
- "examples/speaker0.mp3",
196
- True,
 
 
197
  ],
198
  ]
199
 
200
  with gr.Blocks(analytics_enabled=False) as demo:
201
 
202
- with gr.Row():
203
- with gr.Column():
204
- with gr.Row():
205
- gr.Markdown(
206
- """
207
- ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
208
- """
209
- )
210
- with gr.Row():
211
- gr.Markdown(markdown_table_v2)
212
- with gr.Row():
213
- gr.Markdown(description)
214
- with gr.Column():
215
- gr.Video('./open_voice.mp4', autoplay=True)
216
-
217
- with gr.Row():
218
- gr.HTML(wrapped_markdown_content)
219
 
220
  with gr.Row():
221
  with gr.Column():
@@ -224,24 +161,35 @@ with gr.Blocks(analytics_enabled=False) as demo:
224
  info="One or two sentences at a time is better. Up to 200 text characters.",
225
  value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
226
  )
227
- style_gr = gr.Dropdown(
228
- label="Style",
229
- info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
230
- choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
231
- max_choices=1,
232
- value="default",
233
- )
234
  ref_gr = gr.Audio(
235
  label="Reference Audio",
236
  info="Click on the ✎ button to upload your own target speaker audio",
237
  type="filepath",
238
- value="examples/speaker2.mp3",
 
 
 
 
 
 
239
  )
240
- tos_gr = gr.Checkbox(
241
- label="Agree",
242
  value=False,
243
- info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
244
  )
 
 
 
 
 
245
 
246
  tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
247
 
@@ -253,11 +201,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
253
 
254
  gr.Examples(examples,
255
  label="Examples",
256
- inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
 
257
  outputs=[out_text_gr, audio_gr, ref_audio_gr],
258
  fn=predict,
259
  cache_examples=False,)
260
- tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
 
261
 
262
- demo.queue(concurrency_count=6)
263
- demo.launch(debug=True, show_api=True)
 
1
import argparse
import os
import sys

import devicetorch
import gradio as gr
import torch
from melo.api import TTS
#from zipfile import ZipFile

# Init EN/ZH baseTTS and ToneConvertor
from OpenVoice import se_extractor
from OpenVoice.api import ToneColorConverter
12
# Pick the compute device once and load the tone colour converter at start-up
# so every request reuses the same converter instance.
device = devicetorch.get(torch)
ckpt_converter = 'checkpoints/converter'
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Base languages and the English speaker keys offered by the app.
languages = ["EN", "ES", "FR", "ZH", "JP", "KR"]
en = ["EN-Default", "EN-US", "EN-BR", "EN_INDIA", "EN-AU"]

# Language served by this process, read from the first command-line argument.
# Fall back to English when the script is started without an argument instead
# of crashing with an IndexError.
LANG = sys.argv[1].strip() if len(sys.argv) > 1 else "EN"
 
 
 
 
 
 
 
 
 
 
 
22
 
 
 
 
23
 
24
+ #def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, language):
25
def predict(prompt, audio_file_pth, mic_file_path, use_mic, language):
    """Synthesise ``prompt`` and convert its tone colour to the reference speaker.

    Args:
        prompt: Text to speak; must contain at least two characters.
        audio_file_pth: Path of the reference audio file, used when
            ``use_mic`` is false.
        mic_file_path: Path of the microphone recording, used when
            ``use_mic`` is true.
        use_mic: Whether the microphone recording is the reference audio.
        language: Speaker key selected in the UI (e.g. ``"EN-US"``, ``"ZH"``).

    Returns:
        A 3-tuple ``(info_text, synthesised_audio_path, reference_audio_path)``.
        When validation or tone colour extraction fails, both paths are
        ``None`` and ``info_text`` carries the error message.
    """
    text_hint = ''

    # Validate the cheap inputs first so a bad request never pays for
    # loading the TTS model or the base speaker checkpoint.
    if use_mic:
        if mic_file_path is None:
            text_hint += "[ERROR] Please record your voice with Microphone, or uncheck Use Microphone to use reference audios\n"
            gr.Warning(
                "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
            )
            return (
                text_hint,
                None,
                None,
            )
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    if len(prompt) < 2:
        text_hint += "[ERROR] Please give a longer prompt text \n"
        gr.Warning("Please give a longer prompt text")
        return (
            text_hint,
            None,
            None,
        )

    # Extract the tone colour embedding of the reference speaker.
    try:
        target_se, _ = se_extractor.get_se(
            speaker_wav,
            tone_color_converter,
            target_dir='processed',
            max_length=60.,
            vad=True,
        )
    except Exception as e:
        # Build the message once so the UI warning shows the real error text
        # (the warning previously displayed the literal "{str(e)}").
        error_msg = f"[ERROR] Get target tone color error {str(e)} \n"
        text_hint += error_msg
        gr.Warning(error_msg)
        return (
            text_hint,
            None,
            None,
        )

    # Every English accent shares the single "EN" base model.
    lang_code = "EN" if language.startswith("EN") else language
    tts_model = TTS(language=lang_code, device=device)

    # Checkpoint file names are lower-case and use '-' (e.g. EN_INDIA -> en-india).
    speaker_key = language.lower().replace('_', '-')
    source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)

    # Make sure the output directory exists before anything is written to it.
    output_dir = os.path.abspath("output")
    os.makedirs(output_dir, exist_ok=True)
    src_path = f'{output_dir}/tmp.wav'
    save_path = f'{output_dir}/output.wav'

    speaker_ids = tts_model.hps.data.spk2id
    print(f"Speaker_ids= {speaker_ids}, language={language}, speaker_key={speaker_key}")
    speaker_id = speaker_ids[language]

    # Step 1: base speaker synthesis into a temporary file.
    tts_model.tts_to_file(prompt, speaker_id, src_path)

    # Step 2: run the tone color converter from the base speaker to the target.
    # NOTE(review): `message` is presumably embedded as a watermark - confirm
    # against ToneColorConverter.convert.
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)

    text_hint += 'Get response successfully \n'

    return (
        text_hint,
        save_path,
        speaker_wav,
    )
107
 
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
# Compact description of the demo rows: (prompt text, reference audio, language).
_EXAMPLE_SPECS = (
    ("今天天气真好,我们一起出去吃饭吧。", "examples/speaker0.mp3", "ZH"),
    ("お前はもう死んでいる", "examples/speaker0.mp3", "JP"),
    ("오빤 강남 스타일", "examples/speaker0.mp3", "KR"),
    ("This audio is generated by open voice with a half-performance model.", "examples/speaker1.mp3", "EN-BR"),
    ("He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.", "examples/speaker2.mp3", "EN-BR"),
)

# Each row holds five values: prompt, reference audio path, microphone path,
# use-microphone flag and language. No example uses the microphone, so those
# two fields are always None / False.
examples = [
    [prompt_text, reference_path, None, False, language_key]
    for prompt_text, reference_path, language_key in _EXAMPLE_SPECS
]
151
 
152
# Build the Gradio interface: inputs in the first column, results and examples
# in the second.
# NOTE(review): the nesting below is reconstructed because the original
# indentation was not available - confirm the placement of gr.Examples and the
# click handler.
with gr.Blocks(analytics_enabled=False) as demo:

    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/speaker0.mp3",
            )
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
                info="Use your microphone to record audio",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
                info="Notice: Microphone input may not work properly under traffic",
            )
            # English offers a choice of accents; any other language is fixed
            # to the one this process was started with.
            if LANG == "EN":
                language = gr.Radio(
                    ['EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default'],
                    label='Language',
                    value='EN-US',
                )
            else:
                language = gr.Radio([LANG], value=LANG)

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

            # The examples and the Send button feed predict() the same
            # components, in predict()'s parameter order.
            predict_inputs = [input_text_gr, ref_gr, mic_gr, use_mic_gr, language]
            predict_outputs = [out_text_gr, audio_gr, ref_audio_gr]

            gr.Examples(
                examples,
                label="Examples",
                inputs=predict_inputs,
                outputs=predict_outputs,
                fn=predict,
                cache_examples=False,
            )
            tts_button.click(predict, predict_inputs, outputs=predict_outputs)

demo.queue()
demo.launch(debug=True, show_api=True)
app_locally.py DELETED
@@ -1,213 +0,0 @@
1
- import os
2
- import torch
3
- import argparse
4
- import gradio as gr
5
- #from zipfile import ZipFile
6
- from melo.api import TTS
7
-
8
- # Init EN/ZH baseTTS and ToneConvertor
9
- from OpenVoice import se_extractor
10
- from OpenVoice.api import ToneColorConverter
11
- import devicetorch
12
- device = devicetorch.get(torch)
13
- ckpt_converter = 'checkpoints/converter'
14
- tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
15
- tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
16
-
17
-
18
- languages = ["EN", "ES", "FR", "ZH", "JP", "KR"]
19
- en = ["EN-Default", "EN-US", "EN-BR", "EN_INDIA", "EN-AU"]
20
-
21
- LANG = sys.argv[1].strip()
22
-
23
-
24
- #def predict(prompt, style, audio_file_pth, mic_file_path, use_mic, language):
25
- def predict(prompt, audio_file_pth, mic_file_path, use_mic, language):
26
- # initialize a empty info
27
- text_hint = ''
28
-
29
- lang_code = language
30
- if language.startswith("EN"):
31
- lang_code = "EN"
32
- tts_model = TTS(language=lang_code, device=device)
33
-
34
- speaker_key = language.lower().replace('_', '-')
35
- source_se = torch.load(f'checkpoints/base_speakers/ses/{speaker_key}.pth', map_location=device)
36
-
37
- if use_mic == True:
38
- if mic_file_path is not None:
39
- speaker_wav = mic_file_path
40
- else:
41
- text_hint += f"[ERROR] Please record your voice with Microphone, or uncheck Use Microphone to use reference audios\n"
42
- gr.Warning(
43
- "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
44
- )
45
- return (
46
- text_hint,
47
- None,
48
- None,
49
- )
50
-
51
- else:
52
- speaker_wav = audio_file_pth
53
-
54
- if len(prompt) < 2:
55
- text_hint += f"[ERROR] Please give a longer prompt text \n"
56
- gr.Warning("Please give a longer prompt text")
57
- return (
58
- text_hint,
59
- None,
60
- None,
61
- )
62
-
63
- # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
64
- try:
65
- target_se, wavs_folder = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', max_length=60., vad=True)
66
- # os.system(f'rm -rf {wavs_folder}')
67
- except Exception as e:
68
- text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
69
- gr.Warning(
70
- "[ERROR] Get target tone color error {str(e)} \n"
71
- )
72
- return (
73
- text_hint,
74
- None,
75
- None,
76
- )
77
-
78
- output_dir = os.path.abspath("output")
79
- src_path = f'{output_dir}/tmp.wav'
80
-
81
- speed = 1.0
82
-
83
- #tts_model.tts_to_file(prompt, speaker_id, src_path, speaker=style, language=language)
84
- speaker_ids = tts_model.hps.data.spk2id
85
- print(f"Speaker_ids= {speaker_ids}, language={language}, speaker_key={speaker_key}")
86
- speaker_id = speaker_ids[language]
87
-
88
- tts_model.tts_to_file(prompt, speaker_id, src_path)
89
-
90
- save_path = f'{output_dir}/output.wav'
91
- # Run the tone color converter
92
- encode_message = "@MyShell"
93
- tone_color_converter.convert(
94
- audio_src_path=src_path,
95
- src_se=source_se,
96
- tgt_se=target_se,
97
- output_path=save_path,
98
- message=encode_message)
99
-
100
- text_hint += f'''Get response successfully \n'''
101
-
102
- return (
103
- text_hint,
104
- save_path,
105
- speaker_wav,
106
- )
107
-
108
-
109
- examples = [
110
- [
111
- "今天天气真好,我们一起出去吃饭吧。",
112
- # 'default',
113
- "examples/speaker0.mp3",
114
- None,
115
- False,
116
- "ZH",
117
- ],
118
- [
119
- "お前はもう死んでいる",
120
- # 'default',
121
- "examples/speaker0.mp3",
122
- None,
123
- False,
124
- "JP",
125
- ],
126
- [
127
- "오빤 강남 스타일",
128
- # 'default',
129
- "examples/speaker0.mp3",
130
- None,
131
- False,
132
- "KR",
133
- ],
134
- [
135
- "This audio is generated by open voice with a half-performance model.",
136
- # 'whispering',
137
- "examples/speaker1.mp3",
138
- None,
139
- False,
140
- "EN-BR"
141
- ],
142
- [
143
- "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
144
- # 'sad',
145
- "examples/speaker2.mp3",
146
- None,
147
- False,
148
- "EN-BR"
149
- ],
150
- ]
151
-
152
- with gr.Blocks(analytics_enabled=False) as demo:
153
-
154
- # with gr.Row():
155
- # gr.HTML(wrapped_markdown_content)
156
-
157
- with gr.Row():
158
- with gr.Column():
159
- input_text_gr = gr.Textbox(
160
- label="Text Prompt",
161
- info="One or two sentences at a time is better. Up to 200 text characters.",
162
- value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
163
- )
164
- #style_gr = gr.Dropdown(
165
- # label="Style",
166
- # info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
167
- # choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
168
- # max_choices=1,
169
- # value="default",
170
- #)
171
- ref_gr = gr.Audio(
172
- label="Reference Audio",
173
- info="Click on the ✎ button to upload your own target speaker audio",
174
- type="filepath",
175
- value="examples/speaker0.mp3",
176
- )
177
- mic_gr = gr.Audio(
178
- source="microphone",
179
- type="filepath",
180
- info="Use your microphone to record audio",
181
- label="Use Microphone for Reference",
182
- )
183
- use_mic_gr = gr.Checkbox(
184
- label="Use Microphone",
185
- value=False,
186
- info="Notice: Microphone input may not work properly under traffic",
187
- )
188
- #language = gr.Radio(['EN-Newest', 'EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default', 'ES', 'FR', 'ZH', 'JP', 'KR'], label='Language', value='EN-Newest')
189
- if LANG == "EN":
190
- language = gr.Radio(['EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default'], label='Language', value='EN-US')
191
- else:
192
- language = gr.Radio([LANG], value=LANG)
193
-
194
- tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
195
-
196
-
197
- with gr.Column():
198
- out_text_gr = gr.Text(label="Info")
199
- audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
200
- ref_audio_gr = gr.Audio(label="Reference Audio Used")
201
-
202
- gr.Examples(examples,
203
- label="Examples",
204
- #inputs=[input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language],
205
- inputs=[input_text_gr, ref_gr, mic_gr, use_mic_gr, language],
206
- outputs=[out_text_gr, audio_gr, ref_audio_gr],
207
- fn=predict,
208
- cache_examples=False,)
209
- #tts_button.click(predict, [input_text_gr, style_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
210
- tts_button.click(predict, [input_text_gr, ref_gr, mic_gr, use_mic_gr, language], outputs=[out_text_gr, audio_gr, ref_audio_gr])
211
-
212
- demo.queue()
213
- demo.launch(debug=True, show_api=True)