"""Gradio demo that denoises an audio clip with a pretrained WaveUNet model."""

from logging import getLogger
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import torch
import torchaudio
from denoisers import WaveUNetModel

LOGGER = getLogger(__name__)
# Loaded once at import time so every request reuses the same model instance.
MODEL = WaveUNetModel.from_pretrained("wrice/waveunet-vctk-24khz")


def denoise(inputs: Optional[Tuple[int, np.ndarray]]) -> Tuple[int, np.ndarray]:
    """Denoise an audio clip and return it at the model's sample rate.

    The clip is scaled to floats, resampled to ``MODEL.config.sample_rate``
    when needed, zero-padded to a whole number of ``MODEL.config.max_length``
    sample chunks, run through the model chunk by chunk, and finally trimmed
    back to the (resampled) input length and converted to 16-bit integers.

    Args:
        inputs: ``(sample_rate, samples)`` pair as produced by the Gradio
            ``"audio"`` input, or ``None`` when no audio was supplied.

    Returns:
        ``(MODEL.config.sample_rate, samples)`` where ``samples`` is a 1-D
        ``int16`` array.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        # Surface a readable message in the UI instead of an unpacking TypeError.
        raise gr.Error("No audio provided.")

    sr, audio = inputs
    target_sr = MODEL.config.sample_rate

    # NOTE(review): the 32768 scale assumes 16-bit PCM input -- confirm that
    # the Gradio audio component never hands over another sample format.
    audio = torch.from_numpy(audio) / 32768.0
    if audio.ndim == 2:
        # Presumably Gradio delivers multi-channel audio as (samples, channels);
        # downmix to mono so the last axis is always time. TODO confirm layout.
        audio = audio.mean(dim=-1)
    audio = audio[None]  # add a leading channel axis: (1, samples)

    LOGGER.info("Audio shape: %s", audio.shape)
    LOGGER.info("Sample rate: %s", sr)

    if audio.size(-1) == 0:
        # Nothing to denoise; skip resampling and the model entirely.
        return target_sr, np.zeros(0, dtype=np.int16)

    if sr != target_sr:
        audio = torchaudio.functional.resample(audio, sr, target_sr)

    num_samples = audio.size(-1)
    chunk_size = MODEL.config.max_length

    # Pad only up to the next multiple of chunk_size. When the length is
    # already a multiple this is 0, so no extra all-silence chunk is processed.
    padding = (-num_samples) % chunk_size
    padded = torch.nn.functional.pad(audio, (0, padding))

    with torch.no_grad():
        # Each chunk gets a batch axis for the model; squeeze(0) drops it again.
        clean = [
            MODEL(chunk[None]).logits.squeeze(0)
            for chunk in padded.split(chunk_size, dim=-1)
        ]

    # Join the chunks along the time axis (not the leading axis), drop the
    # padding, and take the single channel so the result is always 1-D.
    denoised = torch.cat(clean, dim=-1)[0, :num_samples].clamp(-1.0, 1.0)
    denoised = (denoised * 32767.0).numpy().astype("int16")

    LOGGER.info("Denoised shape: %s", denoised.shape)
    return target_sr, denoised


iface = gr.Interface(fn=denoise, inputs="audio", outputs="audio")
iface.launch()