OpenSound committed on
Commit
310c080
·
1 Parent(s): 6b40e6f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -16
app.py CHANGED
@@ -207,7 +207,7 @@ def replace_numbers_with_words(sentence):
207
  return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
208
 
209
  @spaces.GPU
210
- def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
211
  audio_path, original_transcript, transcript):
212
 
213
  codec_audio_sr = 16000
@@ -286,7 +286,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
286
  ssrspeech_model_en["text_tokenizer"],
287
  ssrspeech_model_en["audio_tokenizer"],
288
  audio_path, orig_transcript, target_transcript, mask_interval,
289
- cfg_coef, aug_text, False, True, False,
290
  device, decode_config
291
  )
292
  audio_tensors = []
@@ -302,7 +302,7 @@ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
302
 
303
 
304
  @spaces.GPU
305
- def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
306
  audio_path, original_transcript, transcript):
307
 
308
  codec_audio_sr = 16000
@@ -376,7 +376,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
376
  ssrspeech_model_en["text_tokenizer"],
377
  ssrspeech_model_en["audio_tokenizer"],
378
  audio_path, orig_transcript, target_transcript, mask_interval,
379
- cfg_coef, aug_text, False, True, True,
380
  device, decode_config
381
  )
382
  audio_tensors = []
@@ -402,7 +402,7 @@ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, prompt_length,
402
 
403
 
404
  @spaces.GPU
405
- def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
406
  audio_path, original_transcript, transcript):
407
 
408
  codec_audio_sr = 16000
@@ -485,7 +485,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
485
  ssrspeech_model_zh["text_tokenizer"],
486
  ssrspeech_model_zh["audio_tokenizer"],
487
  audio_path, orig_transcript, target_transcript, mask_interval,
488
- cfg_coef, aug_text, False, True, False,
489
  device, decode_config
490
  )
491
  audio_tensors = []
@@ -500,7 +500,7 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
500
 
501
 
502
  @spaces.GPU
503
- def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
504
  audio_path, original_transcript, transcript):
505
 
506
  codec_audio_sr = 16000
@@ -579,7 +579,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, prompt_length,
579
  ssrspeech_model_zh["text_tokenizer"],
580
  ssrspeech_model_zh["audio_tokenizer"],
581
  audio_path, orig_transcript, target_transcript, mask_interval,
582
- cfg_coef, aug_text, False, True, True,
583
  device, decode_config
584
  )
585
  audio_tensors = []
@@ -672,6 +672,8 @@ if __name__ == "__main__":
672
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
673
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
674
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
 
 
675
  prompt_length = gr.Number(label="prompt_length", value=3,
676
  info="used for tts prompt, will automatically cut the prompt audio to this length")
677
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -687,7 +689,7 @@ if __name__ == "__main__":
687
  run_btn.click(fn=run_edit_en,
688
  inputs=[
689
  seed, sub_amount,
690
- aug_text, cfg_coef, prompt_length,
691
  input_audio, original_transcript, transcript,
692
  ],
693
  outputs=[output_audio, success_output])
@@ -695,7 +697,7 @@ if __name__ == "__main__":
695
  transcript.submit(fn=run_edit_en,
696
  inputs=[
697
  seed, sub_amount,
698
- aug_text, cfg_coef, prompt_length,
699
  input_audio, original_transcript, transcript,
700
  ],
701
  outputs=[output_audio, success_output]
@@ -726,6 +728,8 @@ if __name__ == "__main__":
726
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
727
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
728
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
 
 
729
  prompt_length = gr.Number(label="prompt_length", value=3,
730
  info="used for tts prompt, will automatically cut the prompt audio to this length")
731
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -741,7 +745,7 @@ if __name__ == "__main__":
741
  run_btn.click(fn=run_tts_en,
742
  inputs=[
743
  seed, sub_amount,
744
- aug_text, cfg_coef, prompt_length,
745
  input_audio, original_transcript, transcript,
746
  ],
747
  outputs=[output_audio, success_output])
@@ -749,7 +753,7 @@ if __name__ == "__main__":
749
  transcript.submit(fn=run_tts_en,
750
  inputs=[
751
  seed, sub_amount,
752
- aug_text, cfg_coef, prompt_length,
753
  input_audio, original_transcript, transcript,
754
  ],
755
  outputs=[output_audio, success_output]
@@ -780,6 +784,8 @@ if __name__ == "__main__":
780
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
781
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
782
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
 
 
783
  prompt_length = gr.Number(label="prompt_length", value=3,
784
  info="used for tts prompt, will automatically cut the prompt audio to this length")
785
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -795,7 +801,7 @@ if __name__ == "__main__":
795
  run_btn.click(fn=run_edit_zh,
796
  inputs=[
797
  seed, sub_amount,
798
- aug_text, cfg_coef, prompt_length,
799
  input_audio, original_transcript, transcript,
800
  ],
801
  outputs=[output_audio, success_output])
@@ -803,7 +809,7 @@ if __name__ == "__main__":
803
  transcript.submit(fn=run_edit_zh,
804
  inputs=[
805
  seed, sub_amount,
806
- aug_text, cfg_coef, prompt_length,
807
  input_audio, original_transcript, transcript,
808
  ],
809
  outputs=[output_audio, success_output]
@@ -834,6 +840,8 @@ if __name__ == "__main__":
834
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
835
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
836
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
 
 
837
  prompt_length = gr.Number(label="prompt_length", value=3,
838
  info="used for tts prompt, will automatically cut the prompt audio to this length")
839
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
@@ -849,7 +857,7 @@ if __name__ == "__main__":
849
  run_btn.click(fn=run_tts_zh,
850
  inputs=[
851
  seed, sub_amount,
852
- aug_text, cfg_coef, prompt_length,
853
  input_audio, original_transcript, transcript,
854
  ],
855
  outputs=[output_audio, success_output])
@@ -857,7 +865,7 @@ if __name__ == "__main__":
857
  transcript.submit(fn=run_tts_zh,
858
  inputs=[
859
  seed, sub_amount,
860
- aug_text, cfg_coef, prompt_length,
861
  input_audio, original_transcript, transcript,
862
  ],
863
  outputs=[output_audio, success_output]
 
207
  return re.sub(r'\b\d+\b', replace_with_words, sentence) # Regular expression that matches numbers
208
 
209
  @spaces.GPU
210
+ def run_edit_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
211
  audio_path, original_transcript, transcript):
212
 
213
  codec_audio_sr = 16000
 
286
  ssrspeech_model_en["text_tokenizer"],
287
  ssrspeech_model_en["audio_tokenizer"],
288
  audio_path, orig_transcript, target_transcript, mask_interval,
289
+ cfg_coef, cfg_stride, aug_text, False, True, False,
290
  device, decode_config
291
  )
292
  audio_tensors = []
 
302
 
303
 
304
  @spaces.GPU
305
+ def run_tts_en(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
306
  audio_path, original_transcript, transcript):
307
 
308
  codec_audio_sr = 16000
 
376
  ssrspeech_model_en["text_tokenizer"],
377
  ssrspeech_model_en["audio_tokenizer"],
378
  audio_path, orig_transcript, target_transcript, mask_interval,
379
+ cfg_coef, cfg_stride, aug_text, False, True, True,
380
  device, decode_config
381
  )
382
  audio_tensors = []
 
402
 
403
 
404
  @spaces.GPU
405
+ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
406
  audio_path, original_transcript, transcript):
407
 
408
  codec_audio_sr = 16000
 
485
  ssrspeech_model_zh["text_tokenizer"],
486
  ssrspeech_model_zh["audio_tokenizer"],
487
  audio_path, orig_transcript, target_transcript, mask_interval,
488
+ cfg_coef, cfg_stride, aug_text, False, True, False,
489
  device, decode_config
490
  )
491
  audio_tensors = []
 
500
 
501
 
502
  @spaces.GPU
503
+ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
504
  audio_path, original_transcript, transcript):
505
 
506
  codec_audio_sr = 16000
 
579
  ssrspeech_model_zh["text_tokenizer"],
580
  ssrspeech_model_zh["audio_tokenizer"],
581
  audio_path, orig_transcript, target_transcript, mask_interval,
582
+ cfg_coef, cfg_stride, aug_text, False, True, True,
583
  device, decode_config
584
  )
585
  audio_tensors = []
 
672
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
673
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
674
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
675
+ cfg_stride = gr.Number(label="cfg_stride", value=5,
676
+ info="cfg stride, 5 is a good value for English, change if you don't like the results")
677
  prompt_length = gr.Number(label="prompt_length", value=3,
678
  info="used for tts prompt, will automatically cut the prompt audio to this length")
679
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
 
689
  run_btn.click(fn=run_edit_en,
690
  inputs=[
691
  seed, sub_amount,
692
+ aug_text, cfg_coef, cfg_stride, prompt_length,
693
  input_audio, original_transcript, transcript,
694
  ],
695
  outputs=[output_audio, success_output])
 
697
  transcript.submit(fn=run_edit_en,
698
  inputs=[
699
  seed, sub_amount,
700
+ aug_text, cfg_coef, cfg_stride, prompt_length,
701
  input_audio, original_transcript, transcript,
702
  ],
703
  outputs=[output_audio, success_output]
 
728
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
729
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
730
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
731
+ cfg_stride = gr.Number(label="cfg_stride", value=5,
732
+ info="cfg stride, 5 is a good value for English, change if you don't like the results")
733
  prompt_length = gr.Number(label="prompt_length", value=3,
734
  info="used for tts prompt, will automatically cut the prompt audio to this length")
735
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
 
745
  run_btn.click(fn=run_tts_en,
746
  inputs=[
747
  seed, sub_amount,
748
+ aug_text, cfg_coef, cfg_stride, prompt_length,
749
  input_audio, original_transcript, transcript,
750
  ],
751
  outputs=[output_audio, success_output])
 
753
  transcript.submit(fn=run_tts_en,
754
  inputs=[
755
  seed, sub_amount,
756
+ aug_text, cfg_coef, cfg_stride, prompt_length,
757
  input_audio, original_transcript, transcript,
758
  ],
759
  outputs=[output_audio, success_output]
 
784
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
785
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
786
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
787
+ cfg_stride = gr.Number(label="cfg_stride", value=1,
788
+ info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
789
  prompt_length = gr.Number(label="prompt_length", value=3,
790
  info="used for tts prompt, will automatically cut the prompt audio to this length")
791
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
 
801
  run_btn.click(fn=run_edit_zh,
802
  inputs=[
803
  seed, sub_amount,
804
+ aug_text, cfg_coef, cfg_stride, prompt_length,
805
  input_audio, original_transcript, transcript,
806
  ],
807
  outputs=[output_audio, success_output])
 
809
  transcript.submit(fn=run_edit_zh,
810
  inputs=[
811
  seed, sub_amount,
812
+ aug_text, cfg_coef, cfg_stride, prompt_length,
813
  input_audio, original_transcript, transcript,
814
  ],
815
  outputs=[output_audio, success_output]
 
840
  info="set to 1 to use classifer-free guidance, change if you don't like the results")
841
  cfg_coef = gr.Number(label="cfg_coef", value=1.5,
842
  info="cfg guidance scale, 1.5 is a good value, change if you don't like the results")
843
+ cfg_stride = gr.Number(label="cfg_stride", value=1,
844
+ info="cfg stride, 1 is a good value for Mandarin, change if you don't like the results")
845
  prompt_length = gr.Number(label="prompt_length", value=3,
846
  info="used for tts prompt, will automatically cut the prompt audio to this length")
847
  sub_amount = gr.Number(label="sub_amount", value=0.12, info="margin to the left and right of the editing segment, change if you don't like the results")
 
857
  run_btn.click(fn=run_tts_zh,
858
  inputs=[
859
  seed, sub_amount,
860
+ aug_text, cfg_coef, cfg_stride, prompt_length,
861
  input_audio, original_transcript, transcript,
862
  ],
863
  outputs=[output_audio, success_output])
 
865
  transcript.submit(fn=run_tts_zh,
866
  inputs=[
867
  seed, sub_amount,
868
+ aug_text, cfg_coef, cfg_stride, prompt_length,
869
  input_audio, original_transcript, transcript,
870
  ],
871
  outputs=[output_audio, success_output]