JacobLinCool committed on
Commit 671d69d · verified · 1 Parent(s): 71d3f58

Update app.py

Files changed (1)
  1. app.py +24 -95
app.py CHANGED
@@ -66,13 +66,7 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     return speech
 
 @spaces.GPU
-def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which):
-    if select_which == "上傳檔案" and prompt_wav_upload is not None:
-        prompt_wav = prompt_wav_upload
-    elif select_which == "麥克風" and prompt_wav_record is not None:
-        prompt_wav = prompt_wav_record
-    else:
-        prompt_wav = None
+def generate_audio(tts_text, prompt_text, prompt_wav, seed):
     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
 
     prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
@@ -94,75 +88,34 @@ def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record,
 
 
 @spaces.GPU
-def generate_text(prompt_wav_upload, prompt_wav_record, select_which):
-    # Determine which input to use based on the selection in select_which
-    if select_which == "上傳檔案" and prompt_wav_upload is not None:
-        prompt_wav = prompt_wav_upload
-        LAST_UPLOADED = "upload"
-    elif select_which == "麥克風" and prompt_wav_record is not None:
-        prompt_wav = prompt_wav_record
-        LAST_UPLOADED = "record"
-    else:
-        prompt_wav = None
-        LAST_UPLOADED = None
-    print(select_which)
-    # Process with ASR pipeline
+def generate_text(prompt_wav):
     if prompt_wav:
         results = asr_pipeline(prompt_wav)
         return results['text']
     return "No valid input detected."
 
-# LAST_UPLOADED = ""
-# def switch_selected(select_which):
-#     # Check the file type (assuming WAV file)
-#     if select_which == "上傳檔案" and prompt_wav_upload is not None:
-#         prompt_wav = prompt_wav_upload
-#         LAST_UPLOADED = "upload"
-#     elif select_which == "麥克風" and prompt_wav_record is not None:
-#         prompt_wav = prompt_wav_record
-#     return "麥克風"
-
-def demo_get_audio(tts_text):
-    sample_wav = 'sample.wav'
-    speech, sample_rate = torchaudio.load(sample_wav)
-
-    return sample_rate, speech
-
 def main():
     with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
-        # Title and About section at the top
-        gr.Markdown("# BreezyVoice 語音合成系統")
-        # gr.Markdown(
-        #     """## 僅需5秒語音樣本,就可輸出擬真人聲。"""
-        # )
         gr.Markdown(
-            """#### 此沙盒使用 Huggingface Zero GPU (A100)
+            """# BreezyVoice 語音合成系統
+
+            #### Runs on Huggingface Zero GPU (A100)
 
-            為了加快推理速度,g2pw注音標註並未被啟動。"""
+            為了加快推理速度,g2pw 注音標註並未被啟動。"""
        )
 
        # All content arranged in a single column
        with gr.Column():
            # Configuration Section
-
 
-
            # Grouping prompt audio inputs and auto speech recognition in one block using Markdown
            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
            gr.Markdown("選擇 prompt 音訊檔案或錄製 prompt 音訊 (5~15秒),並手動校對自動產生的音訊樣本文本。")
-            prompt_wav_upload = gr.Audio(
-                sources='upload',
+            prompt_wav = gr.Audio(
                type='filepath',
-                label='選擇 prompt 音訊檔案(確保取樣率不低於 16khz'
+                label='選擇 prompt 音訊檔案(確保取樣率不低於 16khz)或錄製 prompt 音訊'
            )
-            prompt_wav_record = gr.Audio(
-                sources='microphone',
-                type='filepath',
-                label='錄製 prompt 音訊檔案'
-            )
-
-            with gr.Blocks():
-                select_which = gr.Radio(["上傳檔案", "麥克風"], label="音訊來源", interactive=True )
+
            with gr.Blocks():
                prompt_text = gr.Textbox(
                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
@@ -170,35 +123,22 @@ def main():
                    placeholder="音訊樣本文本"
                )
 
-            # Automatic speech recognition when either prompt audio input changes
-            def a(X):
-                return "上傳檔案"
-            prompt_wav_upload.change(
-                fn=a,#lambda file: "上傳檔案",
-                inputs=[prompt_wav_upload],
-                outputs=select_which
+            prompt_wav.input(
+                fn=generate_text,
+                inputs=[prompt_wav],
+                outputs=prompt_text
            )
 
-
-
-
-
-            prompt_wav_record.change(
-                fn=lambda recording: "麥克風",
-                inputs=[prompt_wav_record],
-                outputs=select_which
+            gr.Examples(
+                examples=[
+                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
+                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
+                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
+                ],
+                inputs=[prompt_wav, prompt_text],
+                label="範例"
            )
 
-            select_which.change(
-                fn=generate_text,
-                inputs=[prompt_wav_upload, prompt_wav_record, select_which],
-                outputs=prompt_text
-            )
-            # select_which.change(
-            #     fn=switch_selected,
-            #     inputs=[select_which],
-            #     outputs= None
-            # )
            # Input Section: Synthesis Text
 
            gr.Markdown("### 步驟 2.合成文本輸入")
@@ -206,7 +146,7 @@ def main():
                label="輸入想要合成的文本",
                lines=2,
                placeholder="請輸入想要合成的文本...",
-                value="你好,歡迎光臨"
+                value="我今天忙了一整天,現在好想睡覺喔 QQ"
            )
 
 
@@ -228,16 +168,6 @@ def main():
            #     interactive=True
            # )
 
-            gr.Examples(
-                examples=[
-                    ["examples/commonvoice-example-1.mp3", "明月幾時有,去問氣象局"],
-                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
-                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
-                ],
-                inputs=[prompt_wav_upload, prompt_text],
-                label="範例"
-            )
-
            generate_button = gr.Button("生成音訊")
            audio_output = gr.Audio(label="合成音訊")
 
@@ -245,13 +175,12 @@ def main():
        seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
        generate_button.click(
            fn=generate_audio,
-            inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, select_which],
+            inputs=[tts_text, prompt_text, prompt_wav, seed],
            outputs=audio_output
        )
 
-    demo.queue(max_size=10, default_concurrency_limit=1)
    demo.launch()
-
+
if __name__ == '__main__':
    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
    asr_pipeline = pipeline(
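
For reference, the change above collapses the upload/record component pair and the "音訊來源" radio into a single gr.Audio input whose input event drives the ASR auto-fill, so no select_which state has to be kept in sync. Below is a minimal, self-contained sketch of that wiring pattern, assuming Gradio 4.x; fake_asr and fake_tts are hypothetical placeholders standing in for the app's real asr_pipeline and CosyVoice calls, not the actual implementation.

# Minimal sketch of the single-Audio-component pattern used after this commit.
# fake_asr / fake_tts are stand-ins for the real asr_pipeline and CosyVoice calls.
import numpy as np
import gradio as gr


def fake_asr(prompt_wav):
    # Placeholder for asr_pipeline(prompt_wav)['text'].
    return "transcript of the prompt audio" if prompt_wav else "No valid input detected."


def fake_tts(tts_text, prompt_text, prompt_wav, seed):
    # Placeholder for generate_audio(); returns one second of silence at 16 kHz.
    return 16000, np.zeros(16000, dtype=np.float32)


with gr.Blocks() as demo:
    # One Audio component covers both upload and microphone, replacing the
    # two components plus radio selector that the old code needed.
    prompt_wav = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Prompt audio (upload or record)",
    )
    prompt_text = gr.Textbox(label="Prompt transcript")
    tts_text = gr.Textbox(label="Text to synthesize")
    seed = gr.Number(value=0, label="Seed")
    generate_button = gr.Button("Generate")
    audio_output = gr.Audio(label="Synthesized audio")

    # .input fires when the user provides audio from either source,
    # so the transcript box is filled automatically.
    prompt_wav.input(fn=fake_asr, inputs=[prompt_wav], outputs=prompt_text)
    generate_button.click(
        fn=fake_tts,
        inputs=[tts_text, prompt_text, prompt_wav, seed],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()

Running this sketch and dropping any audio file on the component fills the transcript box immediately, mirroring how prompt_wav.input feeds generate_text in the updated app.py.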