Spaces:

lyimo
/

speech_separation

Runtime error

App Files Files Community

lyimo committed on Oct 29, 2024

Commit

95facbc

verified ·

1 Parent(s): 518eabe

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -37

app.py CHANGED Viewed

@@ -1,44 +1,130 @@
 import gradio as gr
-import torch
 import torchaudio
-from speechbrain.inference.separation import SepformerSeparation as separator
 import os
-# Load the enhancement model
-model = separator.from_hparams(
-    source="speechbrain/sepformer-dns4-16k-enhancement",
-    savedir='pretrained_models/sepformer-dns4-16k-enhancement'
-)
-# Define the enhancement function
-def enhance_audio(noisy_audio):
-    # Convert MP3 to WAV
-    wav_audio = "temp_audio.wav"
-    torchaudio.save(wav_audio, *torchaudio.load(noisy_audio))
-    # Load and add a batch dimension to the audio tensor
-    noisy = model.load_audio(wav_audio).unsqueeze(0)
-    # Enhance the audio
-    enhanced = model.enhance_batch(noisy, lengths=torch.tensor([1.0]))
-    # Save enhanced audio to a file
-    enhanced_path = "enhanced.wav"
-    torchaudio.save(enhanced_path, enhanced.cpu(), 16000)
-    # Clean up the temporary audio file
-    os.remove(wav_audio)
-    return enhanced_path
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=enhance_audio,
-    inputs=gr.Audio(type="filepath", label="Upload Noisy Audio"),
-    outputs=gr.Audio(type="filepath", label="Enhanced Audio"),
-    title="Speech Enhancement App",
-    description="Upload a noisy audio file to enhance the quality. The enhanced audio can be downloaded after processing."
-)
-# Launch the Gradio app with public link enabled
-interface.launch(share=True)

 import gradio as gr
 import torchaudio
+import torch
 import os
+from pydub import AudioSegment
+import tempfile
+from speechbrain.pretrained.separation import SepformerSeparation
+class AudioDenoiser:
+    """
+    Speech enhancer built on the SpeechBrain SepFormer model
+    "speechbrain/sepformer-dns4-16k-enhancement".
+    Uploaded audio is first converted to a 16 kHz mono WAV, passed through the
+    model, and the result is written to "enhanced_audio/enhanced_audio.wav".
+    """
+
+    def __init__(self):
+        # Initialize the SepFormer model for audio enhancement
+        self.model = SepformerSeparation.from_hparams(
+            source="speechbrain/sepformer-dns4-16k-enhancement",
+            savedir='pretrained_models/sepformer-dns4-16k-enhancement'
+        )
+        # Create output directory if it doesn't exist
+        os.makedirs("enhanced_audio", exist_ok=True)
+
+    def convert_audio_to_wav(self, input_path):
+        """
+        Convert any audio format to WAV with proper settings
+        Args:
+            input_path (str): Path to input audio file
+        Returns:
+            str: Path to converted WAV file (a temporary file the caller
+                is responsible for deleting)
+        Raises:
+            gr.Error: If the input cannot be decoded or exported
+        """
+        # Reserve a unique path and close the handle immediately: the file is
+        # reopened by the exporter, and an open handle would otherwise leak.
+        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
+            temp_wav_path = temp_wav.name
+        try:
+            # Load audio using pydub (supports multiple formats)
+            audio = AudioSegment.from_file(input_path)
+            # Convert to mono if stereo
+            if audio.channels > 1:
+                audio = audio.set_channels(1)
+            # Export as WAV with proper settings
+            exported = audio.export(
+                temp_wav_path,
+                format='wav',
+                parameters=[
+                    '-ar', '16000',  # Set sample rate to 16kHz
+                    '-ac', '1'       # Set channels to mono
+                ]
+            )
+            # export() hands back the open output file; release it here
+            exported.close()
+            return temp_wav_path
+        except Exception as e:
+            # Do not leave a half-written temporary file behind on failure
+            if os.path.exists(temp_wav_path):
+                os.unlink(temp_wav_path)
+            raise gr.Error(f"Error converting audio format: {str(e)}") from e
+
+    def enhance_audio(self, audio_path):
+        """
+        Process the input audio file and return the enhanced version
+        Args:
+            audio_path (str): Path to the input audio file
+        Returns:
+            str: Path to the enhanced audio file
+        Raises:
+            gr.Error: If no file was provided, or conversion, enhancement
+                or saving fails
+        """
+        # Submitting the form without an upload yields an empty path
+        if not audio_path:
+            raise gr.Error("Error processing audio: no audio file was provided")
+        # Convert input audio to proper WAV format. This already raises
+        # gr.Error with its own message, so it stays outside the try below
+        # to avoid wrapping that message a second time.
+        wav_path = self.convert_audio_to_wav(audio_path)
+        try:
+            # Separate and enhance the audio
+            est_sources = self.model.separate_file(path=wav_path)
+            # Generate output filename
+            output_path = os.path.join("enhanced_audio", "enhanced_audio.wav")
+            # Save the enhanced audio (first estimated source only)
+            torchaudio.save(
+                output_path,
+                est_sources[:, :, 0].detach().cpu(),
+                16000  # Sample rate
+            )
+            return output_path
+        except Exception as e:
+            raise gr.Error(f"Error processing audio: {str(e)}") from e
+        finally:
+            # Clean up temporary file on success and on failure alike
+            if os.path.exists(wav_path):
+                os.unlink(wav_path)
+def create_gradio_interface():
+    """
+    Build the Gradio interface that feeds an uploaded audio file to the denoiser
+    Returns:
+        gr.Interface: The configured interface (not launched here)
+    """
+    # Constructing the denoiser loads the enhancement model up front
+    enhancer = AudioDenoiser()
+    # Both sides of the UI exchange audio as file paths
+    noisy_input = gr.Audio(
+        type="filepath",
+        label="Upload Noisy Audio"
+    )
+    enhanced_output = gr.Audio(
+        label="Enhanced Audio",
+        type="filepath"
+    )
+    # Assemble the interface and hand it back to the caller
+    return gr.Interface(
+        fn=enhancer.enhance_audio,
+        inputs=noisy_input,
+        outputs=enhanced_output,
+        title="Audio Denoising using SepFormer",
+        description="""
+        This application uses the SepFormer model from SpeechBrain to enhance audio quality
+        by removing background noise. Supports various audio formats including MP3 and WAV.
+        """,
+        article="""
+        Supported audio formats:
+        - MP3
+        - WAV
+        - OGG
+        - FLAC
+        - M4A
+        and more...
+        The audio will automatically be converted to the correct format for processing.
+        """
+    )
+if __name__ == "__main__":
+    # Executed as a script: build the interface and start serving it
+    create_gradio_interface().launch()