Spaces:

OpenSound
/

SSR-Speech

Running on Zero

App Files Community

OpenSound committed on Jan 2

Commit

bc21a92

verified ·

1 Parent(s): 951dfe7

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -6

app.py CHANGED Viewed

@@ -333,7 +333,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-    torchaudio.save(audio_path, new_audio, codec_audio_sr)
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
@@ -423,6 +423,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
     [new_transcript, new_segments, _, _] = transcribe_en(audio_path)
@@ -532,7 +533,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
-    torchaudio.save(audio_path, new_audio, codec_audio_sr)
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
@@ -626,6 +627,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
     [new_transcript, new_segments, _,_] = transcribe_zh(audio_path)
@@ -837,8 +839,8 @@ if __name__ == "__main__":
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                        cfg_stride3 = gr.Number(label="cfg_stride", value=1,
-                                            info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                         prompt_length3 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -891,8 +893,8 @@ if __name__ == "__main__":
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
-                        cfg_stride4 = gr.Number(label="cfg_stride", value=1,
-                                            info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                         prompt_length4 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")

     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
+    # torchaudio.save(audio_path, new_audio, codec_audio_sr)
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
+    audio_path = audio_path.replace('.','_tmp.')
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
     [new_transcript, new_segments, _, _] = transcribe_en(audio_path)
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
+    # torchaudio.save(audio_path, new_audio, codec_audio_sr)
     audio_tensors.append(new_audio)
     output_audio = get_output_audio(audio_tensors, codec_audio_sr)
     audio_tensors = []
     # save segments for comparison
     new_audio = new_audio[0].cpu()
+    audio_path = audio_path.replace('.','_tmp.')
     torchaudio.save(audio_path, new_audio, codec_audio_sr)
     [new_transcript, new_segments, _,_] = transcribe_zh(audio_path)
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef3 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride3 = gr.Number(label="cfg_stride", value=3,
+                                            info="cfg stride, 3 is a good value for Mandarin, change if you don't like the results")
                         prompt_length3 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount3 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef4 = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride4 = gr.Number(label="cfg_stride", value=3,
+                                            info="cfg stride, 3 is a good value for Mandarin, change if you don't like the results")
                         prompt_length4 = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount4 = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")