JacobLinCool committed on
Commit 671d69d · verified · 1 Parent(s): 71d3f58

Update app.py

Files changed (1)
  1. app.py +24 -95
app.py CHANGED
@@ -66,13 +66,7 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     return speech
 
 @spaces.GPU
-def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which):
-    if select_which == "上傳檔案" and prompt_wav_upload is not None:
-        prompt_wav = prompt_wav_upload
-    elif select_which == "麥克風" and prompt_wav_record is not None:
-        prompt_wav = prompt_wav_record
-    else:
-        prompt_wav = None
+def generate_audio(tts_text, prompt_text, prompt_wav, seed):
     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
 
     prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
@@ -94,75 +88,34 @@ def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record,
 
 
 @spaces.GPU
-def generate_text(prompt_wav_upload, prompt_wav_record, select_which):
-    # Determine which input to use based on the selection in select_which
-    if select_which == "上傳檔案" and prompt_wav_upload is not None:
-        prompt_wav = prompt_wav_upload
-        LAST_UPLOADED = "upload"
-    elif select_which == "麥克風" and prompt_wav_record is not None:
-        prompt_wav = prompt_wav_record
-        LAST_UPLOADED = "record"
-    else:
-        prompt_wav = None
-        LAST_UPLOADED = None
-    print(select_which)
-    # Process with ASR pipeline
+def generate_text(prompt_wav):
     if prompt_wav:
         results = asr_pipeline(prompt_wav)
         return results['text']
     return "No valid input detected."
 
-# LAST_UPLOADED = ""
-# def switch_selected(select_which):
-#     # Check the file type (assuming WAV file)
-#     if select_which == "上傳檔案" and prompt_wav_upload is not None:
-#         prompt_wav = prompt_wav_upload
-#         LAST_UPLOADED = "upload"
-#     elif select_which == "麥克風" and prompt_wav_record is not None:
-#         prompt_wav = prompt_wav_record
-#     return "麥克風"
-
-def demo_get_audio(tts_text):
-    sample_wav = 'sample.wav'
-    speech, sample_rate = torchaudio.load(sample_wav)
-
-    return sample_rate, speech
-
 def main():
     with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
-        # Title and About section at the top
-        gr.Markdown("# BreezyVoice 語音合成系統")
-        # gr.Markdown(
-        #     """## 僅需5秒語音樣本,就可輸出擬真人聲。"""
-        # )
         gr.Markdown(
-            """#### 此沙盒使用 Huggingface Zero GPU (A100)
+            """# BreezyVoice 語音合成系統
+
+            #### Runs on Huggingface Zero GPU (A100)
 
-            為了加快推理速度,g2pw注音標註並未被啟動。"""
+            為了加快推理速度,g2pw 注音標註並未被啟動。"""
        )
 
        # All content arranged in a single column
        with gr.Column():
            # Configuration Section
-
 
-
            # Grouping prompt audio inputs and auto speech recognition in one block using Markdown
            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
            gr.Markdown("選擇 prompt 音訊檔案或錄製 prompt 音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
-            prompt_wav_upload = gr.Audio(
-                sources='upload',
+            prompt_wav = gr.Audio(
                type='filepath',
-                label='選擇 prompt 音訊檔案(確保取樣率不低於 16khz'
+                label='選擇 prompt 音訊檔案(確保取樣率不低於 16khz)或錄製 prompt 音訊'
            )
-            prompt_wav_record = gr.Audio(
-                sources='microphone',
-                type='filepath',
-                label='錄製 prompt 音訊檔案'
-            )
-
-            with gr.Blocks():
-                select_which = gr.Radio(["上傳檔案", "麥克風"], label="音訊來源", interactive=True )
+
            with gr.Blocks():
                prompt_text = gr.Textbox(
                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
@@ -170,35 +123,22 @@ def main():
                    placeholder="音訊樣本文本"
                )
 
-            # Automatic speech recognition when either prompt audio input changes
-            def a(X):
-                return "上傳檔案"
-            prompt_wav_upload.change(
-                fn=a,#lambda file: "上傳檔案",
-                inputs=[prompt_wav_upload],
-                outputs=select_which
+            prompt_wav.input(
+                fn=generate_text,
+                inputs=[prompt_wav],
+                outputs=prompt_text
            )
 
-
-
-
-
-            prompt_wav_record.change(
-                fn=lambda recording: "麥克風",
-                inputs=[prompt_wav_record],
-                outputs=select_which
+            gr.Examples(
+                examples=[
+                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
+                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
+                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
+                ],
+                inputs=[prompt_wav, prompt_text],
+                label="範例"
            )
 
-            select_which.change(
-                fn=generate_text,
-                inputs=[prompt_wav_upload, prompt_wav_record, select_which],
-                outputs=prompt_text
-            )
-            # select_which.change(
-            #     fn=switch_selected,
-            #     inputs=[select_which],
-            #     outputs= None
-            # )
            # Input Section: Synthesis Text
 
            gr.Markdown("### 步驟 2.合成文本輸入")
@@ -206,7 +146,7 @@ def main():
                label="輸入想要合成的文本",
                lines=2,
                placeholder="請輸入想要合成的文本...",
-                value="你好,歡迎光臨"
+                value="我今天忙了一整天,現在好想睡覺喔 QQ"
            )
 
 
@@ -228,16 +168,6 @@ def main():
            #     interactive=True
            # )
 
-            gr.Examples(
-                examples=[
-                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
-                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
-                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
-                ],
-                inputs=[prompt_wav_upload, prompt_text],
-                label="範例"
-            )
-
            generate_button = gr.Button("生成音訊")
            audio_output = gr.Audio(label="合成音訊")
 
@@ -245,13 +175,12 @@ def main():
        seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
        generate_button.click(
            fn=generate_audio,
-            inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which],
+            inputs=[tts_text, prompt_text, prompt_wav, seed],
            outputs=audio_output
        )
 
-    demo.queue(max_size=10, default_concurrency_limit=1)
    demo.launch()
-
+
if __name__ == '__main__':
    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
    asr_pipeline = pipeline(
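
For reference, the change above collapses the upload/record component pair and the "音訊來源" radio into a single gr.Audio input whose input event drives the ASR auto-fill, so no select_which state has to be kept in sync. Below is a minimal, self-contained sketch of that wiring pattern, assuming Gradio 4.x; fake_asr and fake_tts are hypothetical placeholders standing in for the app's real asr_pipeline and CosyVoice calls, not the actual implementation.

# Minimal sketch of the single-Audio-component pattern used after this commit.
# fake_asr / fake_tts are stand-ins for the real asr_pipeline and CosyVoice calls.
import numpy as np
import gradio as gr


def fake_asr(prompt_wav):
    # Placeholder for asr_pipeline(prompt_wav)['text'].
    return "transcript of the prompt audio" if prompt_wav else "No valid input detected."


def fake_tts(tts_text, prompt_text, prompt_wav, seed):
    # Placeholder for generate_audio(); returns one second of silence at 16 kHz.
    return 16000, np.zeros(16000, dtype=np.float32)


with gr.Blocks() as demo:
    # One Audio component covers both upload and microphone, replacing the
    # two components plus radio selector that the old code needed.
    prompt_wav = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Prompt audio (upload or record)",
    )
    prompt_text = gr.Textbox(label="Prompt transcript")
    tts_text = gr.Textbox(label="Text to synthesize")
    seed = gr.Number(value=0, label="Seed")
    generate_button = gr.Button("Generate")
    audio_output = gr.Audio(label="Synthesized audio")

    # .input fires when the user provides audio from either source,
    # so the transcript box is filled automatically.
    prompt_wav.input(fn=fake_asr, inputs=[prompt_wav], outputs=prompt_text)
    generate_button.click(
        fn=fake_tts,
        inputs=[tts_text, prompt_text, prompt_wav, seed],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()

Running this sketch and dropping any audio file on the component fills the transcript box immediately, mirroring how prompt_wav.input feeds generate_text in the updated app.py.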