Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -207,7 +207,7 @@ def replace_numbers_with_words(sentence):
|
|
207 |
return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
|
208 |
|
209 |
@spaces.GPU
|
210 |
-
def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
211 |
audio_path, original_transcript, transcript):
|
212 |
|
213 |
codec_audio_sr = 16000
|
@@ -286,7 +286,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
|
286 |
ssrspeech_model_en["text_tokenizer"],
|
287 |
ssrspeech_model_en["audio_tokenizer"],
|
288 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
289 |
-
cfg_coef, aug_text, False, True, False,
|
290 |
device, decode_config
|
291 |
)
|
292 |
audio_tensors = []
|
@@ -302,7 +302,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
|
302 |
|
303 |
|
304 |
@spaces.GPU
|
305 |
-
def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
306 |
audio_path, original_transcript, transcript):
|
307 |
|
308 |
codec_audio_sr = 16000
|
@@ -376,7 +376,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
|
376 |
ssrspeech_model_en["text_tokenizer"],
|
377 |
ssrspeech_model_en["audio_tokenizer"],
|
378 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
379 |
-
cfg_coef, aug_text, False, True, True,
|
380 |
device, decode_config
|
381 |
)
|
382 |
audio_tensors = []
|
@@ -402,7 +402,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
|
402 |
|
403 |
|
404 |
@spaces.GPU
|
405 |
-
def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
406 |
audio_path, original_transcript, transcript):
|
407 |
|
408 |
codec_audio_sr = 16000
|
@@ -485,7 +485,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
|
485 |
ssrspeech_model_zh["text_tokenizer"],
|
486 |
ssrspeech_model_zh["audio_tokenizer"],
|
487 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
488 |
-
cfg_coef, aug_text, False, True, False,
|
489 |
device, decode_config
|
490 |
)
|
491 |
audio_tensors = []
|
@@ -500,7 +500,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
|
500 |
|
501 |
|
502 |
@spaces.GPU
|
503 |
-
def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
504 |
audio_path, original_transcript, transcript):
|
505 |
|
506 |
codec_audio_sr = 16000
|
@@ -579,7 +579,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
|
|
579 |
ssrspeech_model_zh["text_tokenizer"],
|
580 |
ssrspeech_model_zh["audio_tokenizer"],
|
581 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
582 |
-
cfg_coef, aug_text, False, True, True,
|
583 |
device, decode_config
|
584 |
)
|
585 |
audio_tensors = []
|
@@ -672,6 +672,8 @@ if __name__ == "__main__":
|
|
672 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
673 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
674 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
|
|
|
|
675 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
676 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
677 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
@@ -687,7 +689,7 @@ if __name__ == "__main__":
|
|
687 |
run_btn.click(fn=run_edit_en,
|
688 |
inputs=[
|
689 |
seed, sub_amount,
|
690 |
-
aug_text, cfg_coef, prompt_length,
|
691 |
input_audio, original_transcript, transcript,
|
692 |
],
|
693 |
outputs=[output_audio, success_output])
|
@@ -695,7 +697,7 @@ if __name__ == "__main__":
|
|
695 |
transcript.submit(fn=run_edit_en,
|
696 |
inputs=[
|
697 |
seed, sub_amount,
|
698 |
-
aug_text, cfg_coef, prompt_length,
|
699 |
input_audio, original_transcript, transcript,
|
700 |
],
|
701 |
outputs=[output_audio, success_output]
|
@@ -726,6 +728,8 @@ if __name__ == "__main__":
|
|
726 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
727 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
728 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
|
|
|
|
729 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
730 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
731 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
@@ -741,7 +745,7 @@ if __name__ == "__main__":
|
|
741 |
run_btn.click(fn=run_tts_en,
|
742 |
inputs=[
|
743 |
seed, sub_amount,
|
744 |
-
aug_text, cfg_coef, prompt_length,
|
745 |
input_audio, original_transcript, transcript,
|
746 |
],
|
747 |
outputs=[output_audio, success_output])
|
@@ -749,7 +753,7 @@ if __name__ == "__main__":
|
|
749 |
transcript.submit(fn=run_tts_en,
|
750 |
inputs=[
|
751 |
seed, sub_amount,
|
752 |
-
aug_text, cfg_coef, prompt_length,
|
753 |
input_audio, original_transcript, transcript,
|
754 |
],
|
755 |
outputs=[output_audio, success_output]
|
@@ -780,6 +784,8 @@ if __name__ == "__main__":
|
|
780 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
781 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
782 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
|
|
|
|
783 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
784 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
785 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
@@ -795,7 +801,7 @@ if __name__ == "__main__":
|
|
795 |
run_btn.click(fn=run_edit_zh,
|
796 |
inputs=[
|
797 |
seed, sub_amount,
|
798 |
-
aug_text, cfg_coef, prompt_length,
|
799 |
input_audio, original_transcript, transcript,
|
800 |
],
|
801 |
outputs=[output_audio, success_output])
|
@@ -803,7 +809,7 @@ if __name__ == "__main__":
|
|
803 |
transcript.submit(fn=run_edit_zh,
|
804 |
inputs=[
|
805 |
seed, sub_amount,
|
806 |
-
aug_text, cfg_coef, prompt_length,
|
807 |
input_audio, original_transcript, transcript,
|
808 |
],
|
809 |
outputs=[output_audio, success_output]
|
@@ -834,6 +840,8 @@ if __name__ == "__main__":
|
|
834 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
835 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
836 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
|
|
|
|
837 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
838 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
839 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
@@ -849,7 +857,7 @@ if __name__ == "__main__":
|
|
849 |
run_btn.click(fn=run_tts_zh,
|
850 |
inputs=[
|
851 |
seed, sub_amount,
|
852 |
-
aug_text, cfg_coef, prompt_length,
|
853 |
input_audio, original_transcript, transcript,
|
854 |
],
|
855 |
outputs=[output_audio, success_output])
|
@@ -857,7 +865,7 @@ if __name__ == "__main__":
|
|
857 |
transcript.submit(fn=run_tts_zh,
|
858 |
inputs=[
|
859 |
seed, sub_amount,
|
860 |
-
aug_text, cfg_coef, prompt_length,
|
861 |
input_audio, original_transcript, transcript,
|
862 |
],
|
863 |
outputs=[output_audio, success_output]
|
|
|
207 |
return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
|
208 |
|
209 |
@spaces.GPU
|
210 |
+
def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
211 |
audio_path, original_transcript, transcript):
|
212 |
|
213 |
codec_audio_sr = 16000
|
|
|
286 |
ssrspeech_model_en["text_tokenizer"],
|
287 |
ssrspeech_model_en["audio_tokenizer"],
|
288 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
289 |
+
cfg_coef, cfg_stride, aug_text, False, True, False,
|
290 |
device, decode_config
|
291 |
)
|
292 |
audio_tensors = []
|
|
|
302 |
|
303 |
|
304 |
@spaces.GPU
|
305 |
+
def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
306 |
audio_path, original_transcript, transcript):
|
307 |
|
308 |
codec_audio_sr = 16000
|
|
|
376 |
ssrspeech_model_en["text_tokenizer"],
|
377 |
ssrspeech_model_en["audio_tokenizer"],
|
378 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
379 |
+
cfg_coef, cfg_stride, aug_text, False, True, True,
|
380 |
device, decode_config
|
381 |
)
|
382 |
audio_tensors = []
|
|
|
402 |
|
403 |
|
404 |
@spaces.GPU
|
405 |
+
def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
406 |
audio_path, original_transcript, transcript):
|
407 |
|
408 |
codec_audio_sr = 16000
|
|
|
485 |
ssrspeech_model_zh["text_tokenizer"],
|
486 |
ssrspeech_model_zh["audio_tokenizer"],
|
487 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
488 |
+
cfg_coef, cfg_stride, aug_text, False, True, False,
|
489 |
device, decode_config
|
490 |
)
|
491 |
audio_tensors = []
|
|
|
500 |
|
501 |
|
502 |
@spaces.GPU
|
503 |
+
def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
|
504 |
audio_path, original_transcript, transcript):
|
505 |
|
506 |
codec_audio_sr = 16000
|
|
|
579 |
ssrspeech_model_zh["text_tokenizer"],
|
580 |
ssrspeech_model_zh["audio_tokenizer"],
|
581 |
audio_path, orig_transcript, target_transcript, mask_interval,
|
582 |
+
cfg_coef, cfg_stride, aug_text, False, True, True,
|
583 |
device, decode_config
|
584 |
)
|
585 |
audio_tensors = []
|
|
|
672 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
673 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
674 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
675 |
+
cfg_stride = gr.Number(label="cfg_stride", value=5,
|
676 |
+
info="cfg stride, 5 is a good value for English, change if you don't like the results")
|
677 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
678 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
679 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
|
|
689 |
run_btn.click(fn=run_edit_en,
|
690 |
inputs=[
|
691 |
seed, sub_amount,
|
692 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
693 |
input_audio, original_transcript, transcript,
|
694 |
],
|
695 |
outputs=[output_audio, success_output])
|
|
|
697 |
transcript.submit(fn=run_edit_en,
|
698 |
inputs=[
|
699 |
seed, sub_amount,
|
700 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
701 |
input_audio, original_transcript, transcript,
|
702 |
],
|
703 |
outputs=[output_audio, success_output]
|
|
|
728 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
729 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
730 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
731 |
+
cfg_stride = gr.Number(label="cfg_stride", value=5,
|
732 |
+
info="cfg stride, 5 is a good value for English, change if you don't like the results")
|
733 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
734 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
735 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
|
|
745 |
run_btn.click(fn=run_tts_en,
|
746 |
inputs=[
|
747 |
seed, sub_amount,
|
748 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
749 |
input_audio, original_transcript, transcript,
|
750 |
],
|
751 |
outputs=[output_audio, success_output])
|
|
|
753 |
transcript.submit(fn=run_tts_en,
|
754 |
inputs=[
|
755 |
seed, sub_amount,
|
756 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
757 |
input_audio, original_transcript, transcript,
|
758 |
],
|
759 |
outputs=[output_audio, success_output]
|
|
|
784 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
785 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
786 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
787 |
+
cfg_stride = gr.Number(label="cfg_stride", value=1,
|
788 |
+
info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
|
789 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
790 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
791 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
|
|
801 |
run_btn.click(fn=run_edit_zh,
|
802 |
inputs=[
|
803 |
seed, sub_amount,
|
804 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
805 |
input_audio, original_transcript, transcript,
|
806 |
],
|
807 |
outputs=[output_audio, success_output])
|
|
|
809 |
transcript.submit(fn=run_edit_zh,
|
810 |
inputs=[
|
811 |
seed, sub_amount,
|
812 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
813 |
input_audio, original_transcript, transcript,
|
814 |
],
|
815 |
outputs=[output_audio, success_output]
|
|
|
840 |
info="set to 1 to use classifer-free guidance, change if you don't like the results")
|
841 |
cfg_coef = gr.Number(label="cfg_coef", value=1.5,
|
842 |
info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
|
843 |
+
cfg_stride = gr.Number(label="cfg_stride", value=1,
|
844 |
+
info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
|
845 |
prompt_length = gr.Number(label="prompt_length", value=3,
|
846 |
info="used for tts prompt, will automatically cut the prompt audio to this length")
|
847 |
sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
|
|
|
857 |
run_btn.click(fn=run_tts_zh,
|
858 |
inputs=[
|
859 |
seed, sub_amount,
|
860 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
861 |
input_audio, original_transcript, transcript,
|
862 |
],
|
863 |
outputs=[output_audio, success_output])
|
|
|
865 |
transcript.submit(fn=run_tts_zh,
|
866 |
inputs=[
|
867 |
seed, sub_amount,
|
868 |
+
aug_text, cfg_coef, cfg_stride, prompt_length,
|
869 |
input_audio, original_transcript, transcript,
|
870 |
],
|
871 |
outputs=[output_audio, success_output]
|