OpenSound committed on
Commit
9ded2e7
·
verified ·
1 Parent(s): 1678017

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -19
app.py CHANGED
@@ -197,8 +197,6 @@ def transcribe_zh(audio_path):
197
  transcribe_model_name = "medium"
198
  transcribe_model = load_model(transcribe_model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
199
  segments = transcribe_model.transcribe(audio_path, batch_size=8)["segments"]
200
- for segment in segments:
201
- segment['text'] = replace_numbers_with_words(segment['text'])
202
  _, segments = align_zh(segments, audio_path)
203
  state = get_transcribe_state(segments)
204
  success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
@@ -464,8 +462,8 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
464
  sf.write(audio_path, audio, 16000)
465
 
466
  # text normalization
467
- target_transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
468
- orig_transcript = replace_numbers_with_words(original_transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
469
 
470
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
471
 
@@ -562,8 +560,8 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
562
  sf.write(audio_path, audio, 16000)
563
 
564
  # text normalization
565
- target_transcript = replace_numbers_with_words(transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
566
- orig_transcript = replace_numbers_with_words(original_transcript).replace(" ", " ").replace(" ", " ").replace("\n", " ")
567
 
568
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
569
 
@@ -589,7 +587,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
589
 
590
  audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
591
  sf.write(audio_path, audio, 16000)
592
- [orig_transcript, segments, _] = transcribe_zh(audio_path)
593
 
594
 
595
  converter = opencc.OpenCC('t2s')
@@ -724,15 +722,23 @@ if __name__ == "__main__":
724
 
725
  semgents = gr.State() # not used
726
  state = gr.State() # not used
 
 
 
 
 
 
 
 
727
  transcribe_btn.click(fn=transcribe_en,
728
- inputs=[input_audio],
729
  outputs=[original_transcript, semgents, state, success_output])
730
 
731
  run_btn.click(fn=run_edit_en,
732
  inputs=[
733
  seed, sub_amount,
734
  aug_text, cfg_coef, cfg_stride, prompt_length,
735
- input_audio, original_transcript, transcript,
736
  ],
737
  outputs=[output_audio, success_output])
738
 
@@ -740,7 +746,7 @@ if __name__ == "__main__":
740
  inputs=[
741
  seed, sub_amount,
742
  aug_text, cfg_coef, cfg_stride, prompt_length,
743
- input_audio, original_transcript, transcript,
744
  ],
745
  outputs=[output_audio, success_output]
746
  )
@@ -780,15 +786,22 @@ if __name__ == "__main__":
780
 
781
  semgents = gr.State() # not used
782
  state = gr.State() # not used
 
 
 
 
 
 
 
783
  transcribe_btn.click(fn=transcribe_en,
784
- inputs=[input_audio],
785
  outputs=[original_transcript, semgents, state, success_output])
786
 
787
  run_btn.click(fn=run_tts_en,
788
  inputs=[
789
  seed, sub_amount,
790
  aug_text, cfg_coef, cfg_stride, prompt_length,
791
- input_audio, original_transcript, transcript,
792
  ],
793
  outputs=[output_audio, success_output])
794
 
@@ -796,7 +809,7 @@ if __name__ == "__main__":
796
  inputs=[
797
  seed, sub_amount,
798
  aug_text, cfg_coef, cfg_stride, prompt_length,
799
- input_audio, original_transcript, transcript,
800
  ],
801
  outputs=[output_audio, success_output]
802
  )
@@ -836,15 +849,22 @@ if __name__ == "__main__":
836
 
837
  semgents = gr.State() # not used
838
  state = gr.State() # not used
 
 
 
 
 
 
 
839
  transcribe_btn.click(fn=transcribe_zh,
840
- inputs=[input_audio],
841
  outputs=[original_transcript, semgents, state, success_output])
842
 
843
  run_btn.click(fn=run_edit_zh,
844
  inputs=[
845
  seed, sub_amount,
846
  aug_text, cfg_coef, cfg_stride, prompt_length,
847
- input_audio, original_transcript, transcript,
848
  ],
849
  outputs=[output_audio, success_output])
850
 
@@ -852,7 +872,7 @@ if __name__ == "__main__":
852
  inputs=[
853
  seed, sub_amount,
854
  aug_text, cfg_coef, cfg_stride, prompt_length,
855
- input_audio, original_transcript, transcript,
856
  ],
857
  outputs=[output_audio, success_output]
858
  )
@@ -892,15 +912,22 @@ if __name__ == "__main__":
892
 
893
  semgents = gr.State() # not used
894
  state = gr.State() # not used
 
 
 
 
 
 
 
895
  transcribe_btn.click(fn=transcribe_zh,
896
- inputs=[input_audio],
897
  outputs=[original_transcript, semgents, state, success_output])
898
 
899
  run_btn.click(fn=run_tts_zh,
900
  inputs=[
901
  seed, sub_amount,
902
  aug_text, cfg_coef, cfg_stride, prompt_length,
903
- input_audio, original_transcript, transcript,
904
  ],
905
  outputs=[output_audio, success_output])
906
 
@@ -908,7 +935,7 @@ if __name__ == "__main__":
908
  inputs=[
909
  seed, sub_amount,
910
  aug_text, cfg_coef, cfg_stride, prompt_length,
911
- input_audio, original_transcript, transcript,
912
  ],
913
  outputs=[output_audio, success_output]
914
  )
 
197
  transcribe_model_name = "medium"
198
  transcribe_model = load_model(transcribe_model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
199
  segments = transcribe_model.transcribe(audio_path, batch_size=8)["segments"]
 
 
200
  _, segments = align_zh(segments, audio_path)
201
  state = get_transcribe_state(segments)
202
  success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
 
462
  sf.write(audio_path, audio, 16000)
463
 
464
  # text normalization
465
+ target_transcript = transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
466
+ orig_transcript = original_transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
467
 
468
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
469
 
 
560
  sf.write(audio_path, audio, 16000)
561
 
562
  # text normalization
563
+ target_transcript = transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
564
+ orig_transcript = original_transcript.replace(" ", " ").replace(" ", " ").replace("\n", " ")
565
 
566
  [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
567
 
 
587
 
588
  audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
589
  sf.write(audio_path, audio, 16000)
590
+ [orig_transcript, segments, _, _] = transcribe_zh(audio_path)
591
 
592
 
593
  converter = opencc.OpenCC('t2s')
 
722
 
723
  semgents = gr.State() # not used
724
  state = gr.State() # not used
725
+
726
+ audio_state = gr.State(value=f"{DEMO_PATH}/84_121550_000074_000000.wav")
727
+ input_audio.change(
728
+ lambda audio: audio,
729
+ inputs=[input_audio],
730
+ outputs=[audio_state]
731
+ )
732
+
733
  transcribe_btn.click(fn=transcribe_en,
734
+ inputs=[audio_state],
735
  outputs=[original_transcript, semgents, state, success_output])
736
 
737
  run_btn.click(fn=run_edit_en,
738
  inputs=[
739
  seed, sub_amount,
740
  aug_text, cfg_coef, cfg_stride, prompt_length,
741
+ audio_state, original_transcript, transcript,
742
  ],
743
  outputs=[output_audio, success_output])
744
 
 
746
  inputs=[
747
  seed, sub_amount,
748
  aug_text, cfg_coef, cfg_stride, prompt_length,
749
+ audio_state, original_transcript, transcript,
750
  ],
751
  outputs=[output_audio, success_output]
752
  )
 
786
 
787
  semgents = gr.State() # not used
788
  state = gr.State() # not used
789
+ audio_state = gr.State(value=f"{DEMO_PATH}/84_121550_000074_000000.wav")
790
+ input_audio.change(
791
+ lambda audio: audio,
792
+ inputs=[input_audio],
793
+ outputs=[audio_state]
794
+ )
795
+
796
  transcribe_btn.click(fn=transcribe_en,
797
+ inputs=[audio_state],
798
  outputs=[original_transcript, semgents, state, success_output])
799
 
800
  run_btn.click(fn=run_tts_en,
801
  inputs=[
802
  seed, sub_amount,
803
  aug_text, cfg_coef, cfg_stride, prompt_length,
804
+ audio_state, original_transcript, transcript,
805
  ],
806
  outputs=[output_audio, success_output])
807
 
 
809
  inputs=[
810
  seed, sub_amount,
811
  aug_text, cfg_coef, cfg_stride, prompt_length,
812
+ audio_state, original_transcript, transcript,
813
  ],
814
  outputs=[output_audio, success_output]
815
  )
 
849
 
850
  semgents = gr.State() # not used
851
  state = gr.State() # not used
852
+ audio_state = gr.State(value=f"{DEMO_PATH}/aishell3_test.wav")
853
+ input_audio.change(
854
+ lambda audio: audio,
855
+ inputs=[input_audio],
856
+ outputs=[audio_state]
857
+ )
858
+
859
  transcribe_btn.click(fn=transcribe_zh,
860
+ inputs=[audio_state],
861
  outputs=[original_transcript, semgents, state, success_output])
862
 
863
  run_btn.click(fn=run_edit_zh,
864
  inputs=[
865
  seed, sub_amount,
866
  aug_text, cfg_coef, cfg_stride, prompt_length,
867
+ audio_state, original_transcript, transcript,
868
  ],
869
  outputs=[output_audio, success_output])
870
 
 
872
  inputs=[
873
  seed, sub_amount,
874
  aug_text, cfg_coef, cfg_stride, prompt_length,
875
+ audio_state, original_transcript, transcript,
876
  ],
877
  outputs=[output_audio, success_output]
878
  )
 
912
 
913
  semgents = gr.State() # not used
914
  state = gr.State() # not used
915
+ audio_state = gr.State(value=f"{DEMO_PATH}/aishell3_test.wav")
916
+ input_audio.change(
917
+ lambda audio: audio,
918
+ inputs=[input_audio],
919
+ outputs=[audio_state]
920
+ )
921
+
922
  transcribe_btn.click(fn=transcribe_zh,
923
+ inputs=[audio_state],
924
  outputs=[original_transcript, semgents, state, success_output])
925
 
926
  run_btn.click(fn=run_tts_zh,
927
  inputs=[
928
  seed, sub_amount,
929
  aug_text, cfg_coef, cfg_stride, prompt_length,
930
+ audio_state, original_transcript, transcript,
931
  ],
932
  outputs=[output_audio, success_output])
933
 
 
935
  inputs=[
936
  seed, sub_amount,
937
  aug_text, cfg_coef, cfg_stride, prompt_length,
938
+ audio_state, original_transcript, transcript,
939
  ],
940
  outputs=[output_audio, success_output]
941
  )