File size: 1,768 Bytes
fc81f0f
eb75e68
a29e651
eb75e68
 
76842df
a29e651
eb75e68
76842df
 
 
 
 
eb75e68
6c4aae6
fc81f0f
 
76842df
 
 
 
eb75e68
f74d062
eb75e68
3a62ed4
f74d062
 
eb75e68
9ac6d71
 
6c4aae6
c13028f
 
316bc64
 
bcee150
fc81f0f
 
eb75e68
fc81f0f
eb75e68
 
 
 
 
a29e651
4f912e8
eb75e68
7ca9435
eb75e68
 
61559e0
d2d93dc
244d52c
 
6c4aae6
3d1e219
eb75e68
6c4aae6
46ea61b
 
6b54c4d
46ea61b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""Gradio demo for denoisers."""
import gradio as gr
import numpy as np
import torch
import torchaudio
from denoisers import UNet1DModel, WaveUNetModel
from tqdm import tqdm

# Hugging Face Hub model IDs offered in the demo's dropdown (first entry is
# the default). NOTE: the "unet1d" substring in an ID is what selects the
# model class at load time, so new entries must follow this naming scheme.
MODELS = [
    "wrice/unet1d-vctk-48khz",
    "wrice/waveunet-vctk-48khz",
    "wrice/waveunet-vctk-24khz",
]


def denoise(model_name, inputs):
    """Denoise a recording with a pretrained denoiser model.

    Args:
        model_name: Hugging Face Hub model ID. IDs containing "unet1d"
            load ``UNet1DModel``; anything else loads ``WaveUNetModel``.
        inputs: Gradio audio payload: a ``(sample_rate, samples)`` tuple,
            where ``samples`` is a numpy array shaped ``(n,)`` for mono or
            ``(n, channels)`` for multi-channel audio.
            NOTE(review): scaling below assumes int16 PCM samples (Gradio's
            default numpy format) — confirm the component is not configured
            to deliver floats.

    Returns:
        ``(sample_rate, samples)``: the model's sample rate and the
        denoised int16 samples, transposed to Gradio's ``(n, channels)``
        layout.
    """
    # Model class is keyed off the repo name (see MODELS naming scheme).
    if "unet1d" in model_name:
        model = UNet1DModel.from_pretrained(model_name)
    else:
        model = WaveUNetModel.from_pretrained(model_name)

    sr, audio = inputs
    audio = torch.from_numpy(audio)
    # Scale int16 PCM to [-1, 1) floats.
    audio = audio / 32768.0

    if audio.ndim == 1:
        # Mono (n,) -> (1, n).
        audio = audio.unsqueeze(0)
    else:
        # BUGFIX: Gradio provides multi-channel audio as (n, channels);
        # move channels to axis 0 so the downmix below averages channels,
        # not samples.
        audio = audio.transpose(0, 1)

    print(f"Audio shape: {audio.shape}")
    print(f"Sample rate: {sr}")

    # Downmix multi-channel audio to mono.
    if audio.shape[0] > 1:
        audio = audio.mean(0, keepdim=True)

    print(f"Audio shape: {audio.shape}")

    # Resample to the rate the model was trained at.
    if sr != model.config.sample_rate:
        audio = torchaudio.functional.resample(audio, sr, model.config.sample_rate)

    chunk_size = model.config.max_length

    # BUGFIX: pad up to a whole number of chunks; (-len) % chunk is 0 when
    # the length is already aligned (the old abs(len % chunk - chunk) added
    # a full spurious chunk in that case).
    padding = -audio.size(-1) % chunk_size
    padded = torch.nn.functional.pad(audio, (0, padding))
    # BUGFIX: add the batch dimension -> (1, 1, length); the 2-D tensor was
    # previously indexed with three indices, which raises IndexError.
    padded = padded.unsqueeze(0)

    clean = []
    for i in tqdm(range(0, padded.shape[-1], chunk_size)):
        audio_chunk = padded[:, :, i : i + chunk_size]
        with torch.no_grad():
            clean_chunk = model(audio_chunk).audio
        clean.append(clean_chunk.squeeze(0))

    # Stitch chunks, drop the padding tail, clamp, and convert back to
    # int16 PCM.
    denoised = torch.concat(clean, 1)[:, : audio.shape[-1]].clamp(-1.0, 1.0)
    denoised = (denoised * 32767.0).numpy().astype(np.int16)

    print(f"Denoised shape: {denoised.shape}")

    # Transpose to Gradio's (n, channels) layout.
    return model.config.sample_rate, denoised.transpose()


# Wire the demo UI: a model picker plus an audio upload feed `denoise`,
# which returns denoised audio.
_demo_inputs = [
    gr.Dropdown(choices=MODELS, value=MODELS[0]),
    "audio",
]
iface = gr.Interface(fn=denoise, inputs=_demo_inputs, outputs="audio")
iface.launch()