Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -197,8 +197,6 @@ def transcribe_zh(audio_path):
     transcribe_model_name = "medium"
     transcribe_model = load_model(transcribe_model_name, device, asr_options={"suppress_numerals": True, "max_new_tokens": None, "clip_timestamps": None, "hallucination_silence_threshold": None}, language=language)
     segments = transcribe_model.transcribe(audio_path, batch_size=8)["segments"]
-    for segment in segments:
-        segment['text'] = replace_numbers_with_words(segment['text'])
     _, segments = align_zh(segments, audio_path)
     state = get_transcribe_state(segments)
     success_message = "<span style='color:green;'>Success: Transcribe completed successfully!</span>"
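This hunk drops the per-segment digit spell-out in the Chinese transcriber: with "suppress_numerals": True already passed in asr_options, the decoder is steered away from digit tokens, so a second pass over the segments is redundant. A minimal sketch of the pattern being removed, using a hypothetical stand-in for the app's replace_numbers_with_words helper:

    import re

    def replace_numbers_with_words(text):
        # Hypothetical stand-in: spell out ASCII digits so the aligner
        # sees words instead of numerals; the app's real helper may differ.
        words = "zero one two three four five six seven eight nine".split()
        return re.sub(r"\d", lambda m: words[int(m.group())] + " ", text).strip()

    segments = [{"text": "room 42"}]
    for segment in segments:  # the loop this commit removes
        segment["text"] = replace_numbers_with_words(segment["text"])
    print(segments[0]["text"])  # room four two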
@@ -464,8 +462,8 @@ def run_edit_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     sf.write(audio_path, audio, 16000)

     # text normalization
-    target_transcript =
-    orig_transcript =
+    target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")

     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)

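The added normalization (the same two lines recur in run_tts_zh, next hunk) collapses doubled spaces and newlines to single spaces before the transcripts are compared. Chained str.replace only shrinks short runs of spaces; a one-step regex sketch with equivalent intent (normalize_spaces is a name introduced here for illustration):

    import re

    def normalize_spaces(s):
        # Collapse any whitespace run (spaces, tabs, newlines) to one space.
        return re.sub(r"\s+", " ", s).strip()

    print(normalize_spaces("你好  世界\n再见"))  # 你好 世界 再见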
@@ -562,8 +560,8 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,
     sf.write(audio_path, audio, 16000)

     # text normalization
-    target_transcript =
-    orig_transcript =
+    target_transcript = transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")
+    orig_transcript = original_transcript.replace("  ", " ").replace("  ", " ").replace("\n", " ")

     [orig_transcript, segments, _, _] = transcribe_zh(audio_path)

@@ -589,7 +587,7 @@ def run_tts_zh(seed, sub_amount, aug_text, cfg_coef, cfg_stride, prompt_length,

     audio, _ = librosa.load(audio_path, sr=16000, duration=cut_length)
     sf.write(audio_path, audio, 16000)
-    [orig_transcript, segments, _] = transcribe_zh(audio_path)
+    [orig_transcript, segments, _, _] = transcribe_zh(audio_path)


     converter = opencc.OpenCC('t2s')
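The unpacking fix matches the four values transcribe_zh actually produces (transcript, segments, state, success message, per the click wiring below); the old three-element form raised at call time. A sketch with a hypothetical stub standing in for the real function:

    def transcribe_zh_stub(audio_path):  # hypothetical stand-in
        return "transcript", [], {}, "ok"

    try:
        [orig_transcript, segments, _] = transcribe_zh_stub("a.wav")
    except ValueError as e:
        print(e)  # too many values to unpack (expected 3)

    [orig_transcript, segments, _, _] = transcribe_zh_stub("a.wav")  # fixed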
@@ -724,15 +722,23 @@ if __name__ == "__main__":

     semgents = gr.State() # not used
     state = gr.State() # not used
+
+    audio_state = gr.State(value=f"{DEMO_PATH}/84_121550_000074_000000.wav")
+    input_audio.change(
+        lambda audio: audio,
+        inputs=[input_audio],
+        outputs=[audio_state]
+    )
+
     transcribe_btn.click(fn=transcribe_en,
-                 inputs=[
+                 inputs=[audio_state],
                  outputs=[original_transcript, semgents, state, success_output])

     run_btn.click(fn=run_edit_en,
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output])

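This wiring pattern repeats across all four tabs (English/Chinese, edit/TTS): a session-scoped gr.State seeded with the demo file mirrors input_audio via a .change handler, and the click handlers read the audio path from the state rather than from the component. A minimal self-contained sketch of the pattern (the default path and the echo handler are illustrative):

    import gradio as gr

    with gr.Blocks() as demo:
        input_audio = gr.Audio(type="filepath")
        # State is per-session, so each visitor gets an independent copy
        # of the current audio path, seeded with a default demo clip.
        audio_state = gr.State(value="demo/default.wav")
        input_audio.change(lambda audio: audio,
                           inputs=[input_audio],
                           outputs=[audio_state])

        transcript = gr.Textbox()
        transcribe_btn = gr.Button("Transcribe")
        transcribe_btn.click(fn=lambda path: f"(would transcribe {path})",
                             inputs=[audio_state],
                             outputs=[transcript])

    demo.launch()

One plausible motivation: unlike the Audio component, the State holds its seeded default before the user uploads anything, so the buttons work on the demo clip out of the box.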
@@ -740,7 +746,7 @@ if __name__ == "__main__":
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output]
         )
@@ -780,15 +786,22 @@ if __name__ == "__main__":

     semgents = gr.State() # not used
     state = gr.State() # not used
+    audio_state = gr.State(value=f"{DEMO_PATH}/84_121550_000074_000000.wav")
+    input_audio.change(
+        lambda audio: audio,
+        inputs=[input_audio],
+        outputs=[audio_state]
+    )
+
     transcribe_btn.click(fn=transcribe_en,
-                 inputs=[
+                 inputs=[audio_state],
                  outputs=[original_transcript, semgents, state, success_output])

     run_btn.click(fn=run_tts_en,
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output])

@@ -796,7 +809,7 @@ if __name__ == "__main__":
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output]
         )
@@ -836,15 +849,22 @@ if __name__ == "__main__":

     semgents = gr.State() # not used
     state = gr.State() # not used
+    audio_state = gr.State(value=f"{DEMO_PATH}/aishell3_test.wav")
+    input_audio.change(
+        lambda audio: audio,
+        inputs=[input_audio],
+        outputs=[audio_state]
+    )
+
     transcribe_btn.click(fn=transcribe_zh,
-                 inputs=[
+                 inputs=[audio_state],
                  outputs=[original_transcript, semgents, state, success_output])

     run_btn.click(fn=run_edit_zh,
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output])

@@ -852,7 +872,7 @@ if __name__ == "__main__":
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output]
         )
@@ -892,15 +912,22 @@ if __name__ == "__main__":

     semgents = gr.State() # not used
     state = gr.State() # not used
+    audio_state = gr.State(value=f"{DEMO_PATH}/aishell3_test.wav")
+    input_audio.change(
+        lambda audio: audio,
+        inputs=[input_audio],
+        outputs=[audio_state]
+    )
+
     transcribe_btn.click(fn=transcribe_zh,
-                 inputs=[
+                 inputs=[audio_state],
                  outputs=[original_transcript, semgents, state, success_output])

     run_btn.click(fn=run_tts_zh,
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output])

@@ -908,7 +935,7 @@ if __name__ == "__main__":
             inputs=[
                 seed, sub_amount,
                 aug_text, cfg_coef, cfg_stride, prompt_length,
-
+                audio_state, original_transcript, transcript,
             ],
             outputs=[output_audio, success_output]
         )