Spaces:

OpenSound
/

SSR-Speech

Running on Zero

App Files Community

OpenSound committed on Dec 22, 2024

Commit

310c080

1 Parent(s): 6b40e6f

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -16

app.py CHANGED Viewed

@@ -207,7 +207,7 @@ def replace_numbers_with_words(sentence):
     return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
 @spaces.GPU
-def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
@@ -286,7 +286,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         ssrspeech_model_en["text_tokenizer"],
         ssrspeech_model_en["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
-        cfg_coef, aug_text, False, True, False,
         device, decode_config
     )
     audio_tensors = []
@@ -302,7 +302,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
 @spaces.GPU
-def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
@@ -376,7 +376,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         ssrspeech_model_en["text_tokenizer"],
         ssrspeech_model_en["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
-        cfg_coef, aug_text, False, True, True,
         device, decode_config
     )
     audio_tensors = []
@@ -402,7 +402,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
 @spaces.GPU
-def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
@@ -485,7 +485,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         ssrspeech_model_zh["text_tokenizer"],
         ssrspeech_model_zh["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
-        cfg_coef, aug_text, False, True, False,
         device, decode_config
     )
     audio_tensors = []
@@ -500,7 +500,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
 @spaces.GPU
-def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
@@ -579,7 +579,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
         ssrspeech_model_zh["text_tokenizer"],
         ssrspeech_model_zh["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
-        cfg_coef, aug_text, False, True, True,
         device, decode_config
     )
     audio_tensors = []
@@ -672,6 +672,8 @@ if __name__ == "__main__":
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -687,7 +689,7 @@ if __name__ == "__main__":
                 run_btn.click(fn=run_edit_en,
                             inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
@@ -695,7 +697,7 @@ if __name__ == "__main__":
                 transcript.submit(fn=run_edit_en,
                         inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]
@@ -726,6 +728,8 @@ if __name__ == "__main__":
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -741,7 +745,7 @@ if __name__ == "__main__":
                 run_btn.click(fn=run_tts_en,
                             inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
@@ -749,7 +753,7 @@ if __name__ == "__main__":
                 transcript.submit(fn=run_tts_en,
                         inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]
@@ -780,6 +784,8 @@ if __name__ == "__main__":
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -795,7 +801,7 @@ if __name__ == "__main__":
                 run_btn.click(fn=run_edit_zh,
                             inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
@@ -803,7 +809,7 @@ if __name__ == "__main__":
                 transcript.submit(fn=run_edit_zh,
                         inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]
@@ -834,6 +840,8 @@ if __name__ == "__main__":
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -849,7 +857,7 @@ if __name__ == "__main__":
                 run_btn.click(fn=run_tts_zh,
                             inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
@@ -857,7 +865,7 @@ if __name__ == "__main__":
                 transcript.submit(fn=run_tts_zh,
                         inputs=[
                                 seed, sub_amount,
-                                aug_text, cfg_coef, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]

     return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
 @spaces.GPU
+def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
         ssrspeech_model_en["text_tokenizer"],
         ssrspeech_model_en["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
+        cfg_coef, cfg_stride, aug_text, False, True, False,
         device, decode_config
     )
     audio_tensors = []
 @spaces.GPU
+def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
         ssrspeech_model_en["text_tokenizer"],
         ssrspeech_model_en["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
+        cfg_coef, cfg_stride, aug_text, False, True, True,
         device, decode_config
     )
     audio_tensors = []
 @spaces.GPU
+def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
         ssrspeech_model_zh["text_tokenizer"],
         ssrspeech_model_zh["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
+        cfg_coef, cfg_stride, aug_text, False, True, False,
         device, decode_config
     )
     audio_tensors = []
 @spaces.GPU
+def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
         audio_path, original_transcript, transcript):
     codec_audio_sr = 16000
         ssrspeech_model_zh["text_tokenizer"],
         ssrspeech_model_zh["audio_tokenizer"],
         audio_path, orig_transcript, target_transcript, mask_interval,
+        cfg_coef, cfg_stride, aug_text, False, True, True,
         device, decode_config
     )
     audio_tensors = []
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride = gr.Number(label="cfg_stride", value=5,
+                                            info="cfg stride, 5 is a good value for English, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                 run_btn.click(fn=run_edit_en,
                             inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
                 transcript.submit(fn=run_edit_en,
                         inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride = gr.Number(label="cfg_stride", value=5,
+                                            info="cfg stride, 5 is a good value for English, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                 run_btn.click(fn=run_tts_en,
                             inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
                 transcript.submit(fn=run_tts_en,
                         inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride = gr.Number(label="cfg_stride", value=1,
+                                            info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                 run_btn.click(fn=run_edit_zh,
                             inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
                 transcript.submit(fn=run_edit_zh,
                         inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]
                                             info="set to 1 to use classifer-free guidance, change if you don't like the results")
                         cfg_coef = gr.Number(label="cfg_coef", value=1.5,
                                             info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
+                        cfg_stride = gr.Number(label="cfg_stride", value=1,
+                                            info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
                         prompt_length = gr.Number(label="prompt_length", value=3,
                                             info="used for tts prompt, will automatically cut the prompt audio to this length")
                         sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
                 run_btn.click(fn=run_tts_zh,
                             inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                             ],
                             outputs=[output_audio, success_output])
                 transcript.submit(fn=run_tts_zh,
                         inputs=[
                                 seed, sub_amount,
+                                aug_text, cfg_coef, cfg_stride, prompt_length,
                                 input_audio, original_transcript, transcript,
                         ],
                     outputs=[output_audio, success_output]