alibabasglab committed · Commit 8310825 · verified · 1 Parent(s): 10c182e

Update app.py

Files changed (1)
  1. app.py +9 -12
app.py CHANGED

@@ -77,11 +77,10 @@ se_demo = gr.Interface(
         gr.Audio(label="Output Audio", type="filepath"),
     ],
     title = "ClearVoice: Speech Enhancement",
-    description = ("Gradio demo for Speech enhancement with ClearVoice. The models support audios with 16 kHz (FRCRN backbone) and 48 kHz (MossFormer2 backbone) sampling rates. "
-        "We provide the generalized models trained on large scale of data for handling various of background environments. "
-        "To test it, simply upload your audio, or click one of the examples to load them. Read more at the links below."),
+    description = ("ClearVoice is AI-powered and extracts clear speech from background noise for enhanced speech quality. It supports both 16 kHz and 48 kHz audio outputs. "
+        "To try it, simply upload your audio, or click one of the examples. "),
     article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> | <a href='https://github.com/alibabasglab/FRCRN' target='_blank'>Github Repo</a></p>"
-        ),
+        "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
     examples = [
         ["examples/mandarin_speech_16kHz.wav", "16000"],
         ["examples/english_speech_48kHz.wav", "48000"],

@@ -99,9 +98,8 @@ ss_demo = gr.Interface(
         gr.Audio(label="Output Audio", type="filepath"),
     ],
     title = "ClearVoice: Speech Separation",
-    description = ("Gradio demo for Speech separation with ClearVoice. The model (MossFormer2 backbone) supports 2 speakers' audio mixtures with 16 kHz sampling rate. "
-        "We provide the generalized models trained on large scale of data for handling independent speakers and various of background environments. "
-        "To test it, simply upload your audio, or click one of the examples to load them. Read more at the links below."),
+    description = ("ClearVoice is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "
+        "To try it, simply upload your audio, or click one of the examples. "),
     article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
         "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
     examples = [

@@ -119,9 +117,9 @@ tse_demo = gr.Interface(
     outputs = [
         gr.Gallery(label="Output Video List")
     ],
-    title = "ClearVoice: Audio-visual speaker extraction",
-    description = ("Gradio demo for audio-visual speaker extraction with ClearVoice."
-        "To test it, simply upload your video, or click one of the examples to load them. Read more at the links below."),
+    title = "ClearVoice: Audio-Visual Speaker Extraction",
+    description = ("ClearVoice is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. "
+        "To try it, simply upload your video, or click one of the examples. "),
     # article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
     #     "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
     examples = [

@@ -132,7 +130,6 @@ tse_demo = gr.Interface(
 )

 with demo:
-    #gr.TabbedInterface([se_demo], ["Speech Enhancement"])
-    gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Demo_1: Speech Enhancement", "Demo_2: Speech Separation", "Demo_3: Audio-visual Speaker Extraction"])
+    gr.TabbedInterface([se_demo, ss_demo, tse_demo], ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Audio-Visual Speaker Extraction"])

 demo.launch()
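
For orientation, below is a minimal, runnable sketch of the Blocks/TabbedInterface structure this diff edits. Only the pieces visible in the hunks above (output components, titles, the new description text, and the gr.TabbedInterface call) come from the source; the processing function and the input widgets are hypothetical placeholders, since the diff does not show them.

# Minimal sketch of the app structure touched by this commit.
# NOTE: `enhance` and the input components are assumed stand-ins; the real
# app.py wires in the ClearVoice models, which this diff does not show.
import gradio as gr

def enhance(audio_path, sampling_rate):
    # Placeholder: the real function would run ClearVoice speech enhancement here.
    return audio_path

se_demo = gr.Interface(
    fn=enhance,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),                  # assumed input widget
        gr.Dropdown(choices=["16000", "48000"], label="Sampling Rate"),  # assumed input widget
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="ClearVoice: Speech Enhancement",
    description=("ClearVoice is AI-powered and extracts clear speech from background noise for enhanced speech quality. It supports both 16 kHz and 48 kHz audio outputs. "
                 "To try it, simply upload your audio, or click one of the examples. "),
)

demo = gr.Blocks()
with demo:
    # As in the diff: the per-task Interfaces are grouped into tabs.
    gr.TabbedInterface([se_demo], ["Task 1: Speech Enhancement"])

demo.launch()

The separation and speaker-extraction tabs follow the same pattern, each with its own gr.Interface object passed to gr.TabbedInterface alongside its tab label.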