import os

import gradio as gr
import soundfile as sf
import spaces
import torch
from clearvoice import ClearVoice


@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    # Pick the enhancement model that matches the requested sampling rate.
    if sr == "16000":
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        fs = 48000
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # ClearVoice may return either a dict keyed by model name or a bare array.
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav = output_wav_dict[key]
    else:
        output_wav = output_wav_dict
    sf.write('enhanced.wav', output_wav, fs)
    return 'enhanced.wav'


@spaces.GPU
def fn_clearvoice_ss(input_wav):
    # Separate a two-speaker mixture into two 16 kHz streams.
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    if isinstance(output_wav_dict, dict):
        key = next(iter(output_wav_dict))
        output_wav_list = output_wav_dict[key]
    else:
        output_wav_list = output_wav_dict
    output_wav_s1 = output_wav_list[0]
    output_wav_s2 = output_wav_list[1]
    sf.write('separated_s1.wav', output_wav_s1, 16000)
    sf.write('separated_s2.wav', output_wav_s2, 16000)
    return "separated_s1.wav", "separated_s2.wav"


def find_mp4_files(directory):
    mp4_files = []
    # Walk through the directory and its subdirectories
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Keep only the per-speaker estimates, which are written as 'est*.mp4'
            if file.endswith(".mp4") and file.startswith("est"):
                mp4_files.append(os.path.join(root, file))
    return mp4_files


@spaces.GPU
def fn_clearvoice_tse(input_video):
    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    print(f'input_video: {input_video}')
    # online_write=True makes ClearVoice write its results directly to output_path.
    myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
    output_list = find_mp4_files(
        f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{os.path.basename(input_video).split(".")[0]}/'
    )
    return output_list


demo = gr.Blocks()

se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000", "48000"],
            value="16000",  # single choice, so the default is a string, not a list
            multiselect=False,
            label="Sampling Rate",
            info="Choose the sampling rate for your output."
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Enhancement",
    description=("ClearVoice is AI-powered and extracts clear speech from background noise for enhanced speech quality. "
                 "It supports both 16 kHz and 48 kHz audio outputs. "
                 "To try it, simply upload your audio, or click one of the examples. "),
    article=(
        "FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement | Github Repo<br>"
        "MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation | Github Repo"
"), examples = [ ["examples/mandarin_speech_16kHz.wav", "16000"], ["examples/english_speech_48kHz.wav", "48000"], ], cache_examples = True, ) ss_demo = gr.Interface( fn=fn_clearvoice_ss, inputs = [ gr.Audio(label="Input Audio", type="filepath"), ], outputs = [ gr.Audio(label="Output Audio", type="filepath"), gr.Audio(label="Output Audio", type="filepath"), ], title = "ClearVoice: Speech Separation", description = ("ClearVoice is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "To try it, simply upload your audio, or click one of the examples. "), article = ("MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions | Github Repo
" "MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation | Github Repo
"), examples = [ ['examples/female_female_speech.wav'], ['examples/female_male_speech.wav'], ], cache_examples = True, ) tse_demo = gr.Interface( fn=fn_clearvoice_tse, inputs = [ gr.Video(label="Input Video"), ], outputs = [ gr.Gallery(label="Output Video List") ], title = "ClearVoice: Audio-Visual Speaker Extraction", description = ("ClearVoice is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. " "To try it, simply upload your video, or click one of the examples. "), # article = ("MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions | Github Repo
" # "MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation | Github Repo
"), examples = [ ['examples/001.mp4'], ['examples/002.mp4'], ], cache_examples = True, ) with demo: gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Audio-Visual Speaker Extraction"]) demo.launch()