lyimo committed
Commit 3d8b9ae · verified · 1 Parent(s): 17508a1

Update app.py

Files changed (1)
  1. app.py +74 -112
app.py CHANGED
@@ -5,133 +5,84 @@ import os
  from pydub import AudioSegment
  import tempfile
  from speechbrain.pretrained.separation import SepformerSeparation
- import numpy as np
- import threading
- from queue import Queue
- import time

- class RealtimeAudioDenoiser:
+ class AudioDenoiser:
      def __init__(self):
-         # Initialize the model
+         # Initialize the SepFormer model for audio enhancement
          self.model = SepformerSeparation.from_hparams(
              source="speechbrain/sepformer-dns4-16k-enhancement",
              savedir='pretrained_models/sepformer-dns4-16k-enhancement'
          )

-         # Move model to GPU if available
-         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         self.model.to(self.device)
-
-         # Enable inference mode for better performance
-         self.model.eval()
-         torch.set_grad_enabled(False)
-
-         # Set chunk size for streaming (500ms chunks)
-         self.chunk_duration = 0.5  # seconds
-         self.sample_rate = 16000
-         self.chunk_size = int(self.sample_rate * self.chunk_duration)
-
-         # Initialize processing queue and buffer
-         self.processing_queue = Queue()
-         self.output_buffer = Queue()
-         self.is_processing = False
-
-         # Start processing thread
-         self.processing_thread = threading.Thread(target=self._process_queue)
-         self.processing_thread.daemon = True
-         self.processing_thread.start()
-
-         # Create output directory
+         # Create output directory if it doesn't exist
          os.makedirs("enhanced_audio", exist_ok=True)
-
-     def _optimize_model(self):
-         """Optimize model for inference"""
-         if self.device.type == 'cuda':
-             # Use mixed precision for faster processing
-             self.model = torch.quantization.quantize_dynamic(
-                 self.model, {torch.nn.Linear}, dtype=torch.qint8
-             )
-             torch.backends.cudnn.benchmark = True
-
-     def _process_queue(self):
-         """Background thread for processing audio chunks"""
-         while True:
-             if not self.processing_queue.empty():
-                 chunk = self.processing_queue.get()
-                 if chunk is None:
-                     continue
-
-                 # Process audio chunk
-                 enhanced_chunk = self._enhance_chunk(chunk)
-                 self.output_buffer.put(enhanced_chunk)
-             else:
-                 time.sleep(0.01)  # Small delay to prevent CPU overuse
-
-     def _enhance_chunk(self, audio_chunk):
-         """Process a single chunk of audio"""
+
+     def convert_audio_to_wav(self, input_path):
+         """
+         Convert any audio format to WAV with proper settings
+
+         Args:
+             input_path (str): Path to input audio file
+
+         Returns:
+             str: Path to converted WAV file
+         """
          try:
-             # Convert to tensor and move to device
-             chunk_tensor = torch.FloatTensor(audio_chunk).to(self.device)
-             chunk_tensor = chunk_tensor.unsqueeze(0)  # Add batch dimension
+             # Create a temporary file for the converted audio
+             temp_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+             temp_wav_path = temp_wav.name
+
+             # Load audio using pydub (supports multiple formats)
+             audio = AudioSegment.from_file(input_path)
+
+             # Convert to mono if stereo
+             if audio.channels > 1:
+                 audio = audio.set_channels(1)

-             # Process with model
-             with torch.inference_mode():
-                 enhanced = self.model.separate_batch(chunk_tensor)
-                 enhanced = enhanced.squeeze(0).cpu().numpy()
+             # Export as WAV with proper settings
+             audio.export(
+                 temp_wav_path,
+                 format='wav',
+                 parameters=[
+                     '-ar', '16000',  # Set sample rate to 16kHz
+                     '-ac', '1'  # Set channels to mono
+                 ]
+             )

-             return enhanced
+             return temp_wav_path

          except Exception as e:
-             print(f"Error processing chunk: {str(e)}")
-             return audio_chunk
-
-     def process_stream(self, audio_path):
+             raise gr.Error(f"Error converting audio format: {str(e)}")
+
+     def enhance_audio(self, audio_path):
          """
-         Process audio in streaming fashion
+         Process the input audio file and return the enhanced version
+
+         Args:
+             audio_path (str): Path to the input audio file
+
+         Returns:
+             str: Path to the enhanced audio file
          """
          try:
-             # Convert input audio to proper format
-             audio = AudioSegment.from_file(audio_path)
-             audio = audio.set_frame_rate(self.sample_rate)
-             audio = audio.set_channels(1)
-
-             # Convert to numpy array
-             samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
-             samples = samples / np.max(np.abs(samples))  # Normalize
+             # Convert input audio to proper WAV format
+             wav_path = self.convert_audio_to_wav(audio_path)

-             # Process in chunks
-             enhanced_chunks = []
-             for i in range(0, len(samples), self.chunk_size):
-                 chunk = samples[i:i + self.chunk_size]
-
-                 # Pad last chunk if necessary
-                 if len(chunk) < self.chunk_size:
-                     chunk = np.pad(chunk, (0, self.chunk_size - len(chunk)))
-
-                 # Add to processing queue
-                 self.processing_queue.put(chunk)
+             # Separate and enhance the audio
+             est_sources = self.model.separate_file(path=wav_path)

-             # Wait for all chunks to be processed
-             while self.processing_queue.qsize() > 0 or self.output_buffer.qsize() > 0:
-                 if not self.output_buffer.empty():
-                     enhanced_chunks.append(self.output_buffer.get())
-                 time.sleep(0.01)
+             # Generate output filename
+             output_path = os.path.join("enhanced_audio", "enhanced_audio.wav")

-             # Combine chunks
-             enhanced_audio = np.concatenate(enhanced_chunks)
-
-             # Save enhanced audio
-             output_path = os.path.join("enhanced_audio", "enhanced_realtime.wav")
-             enhanced_audio = enhanced_audio * 32767  # Convert to int16 range
-             enhanced_audio = enhanced_audio.astype(np.int16)
+             # Save the enhanced audio
+             torchaudio.save(
+                 output_path,
+                 est_sources[:, :, 0].detach().cpu(),
+                 16000  # Sample rate
+             )

-             with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
-                 torchaudio.save(
-                     f.name,
-                     torch.tensor(enhanced_audio).unsqueeze(0),
-                     self.sample_rate
-                 )
-                 os.replace(f.name, output_path)
+             # Clean up temporary file
+             os.unlink(wav_path)

              return output_path

@@ -140,11 +91,11 @@ class RealtimeAudioDenoiser:

  def create_gradio_interface():
      # Initialize the denoiser
-     denoiser = RealtimeAudioDenoiser()
+     denoiser = AudioDenoiser()

      # Create the Gradio interface
      interface = gr.Interface(
-         fn=denoiser.process_stream,
+         fn=denoiser.enhance_audio,
          inputs=gr.Audio(
              type="filepath",
              label="Upload Noisy Audio"
@@ -153,10 +104,21 @@ def create_gradio_interface():
              label="Enhanced Audio",
              type="filepath"
          ),
-         title="Real-time Audio Denoising using SepFormer",
+         title="Audio Denoising using SepFormer",
          description="""
-         Optimized for real-time processing with low latency.
-         Processes audio in 500ms chunks for streaming applications.
+         This application uses the SepFormer model from SpeechBrain to enhance audio quality
+         by removing background noise. Supports various audio formats including MP3 and WAV.
+         """,
+         article="""
+         Supported audio formats:
+         - MP3
+         - WAV
+         - OGG
+         - FLAC
+         - M4A
+         and more...
+
+         The audio will automatically be converted to the correct format for processing.
          """
      )

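For a quick sanity check of the new code path, the pieces added in this commit can also be exercised outside the UI. The sketch below is illustrative only: the noisy_sample.wav path is a placeholder, the __main__ launch block is not part of this diff, and it assumes create_gradio_interface() returns the gr.Interface it builds.

# Illustrative usage sketch (not part of this commit's diff).
# Direct, non-UI use of the new class:
denoiser = AudioDenoiser()
enhanced_path = denoiser.enhance_audio("noisy_sample.wav")  # placeholder input file
print(enhanced_path)  # -> enhanced_audio/enhanced_audio.wav

# Typical entry point for a Gradio Space, assuming create_gradio_interface()
# returns the interface it constructs:
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()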