import numpy as np
import torch
import torchaudio
import gradio as gr

import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb  # provides the VoiceBox class
# import voicebox.src.attacks.online.voicebox_streamer as streamer  # provides the VoiceBoxStreamer class (streaming variant)
from voicebox.src.constants import PPG_PRETRAINED_PATH

# Default VoiceBox parameters; LOOKAHEAD sets how many future frames the
# bottleneck may use as context
LOOKAHEAD = 5
voicebox_kwargs={'win_length': 256,
    'ppg_encoder_hidden_size': 256,
    'use_phoneme_encoder': True,
    'use_pitch_encoder': True,
    'use_loudness_encoder': True,
    'spec_encoder_lookahead_frames': 0,
    'spec_encoder_type': 'mel',
    'spec_encoder_mlp_depth': 2,
    'bottleneck_lookahead_frames': LOOKAHEAD,
    'ppg_encoder_path': PPG_PRETRAINED_PATH,
    'n_bands': 128,
    'spec_encoder_hidden_size': 512,
    'bottleneck_skip': True,
    'bottleneck_hidden_size': 512,
    'bottleneck_feedforward_size': 512,
    'bottleneck_type': 'lstm',
    'bottleneck_depth': 2,
    'control_eps': 0.5,
    'projection_norm': float('inf'),
    'conditioning_dim': 512}
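# NOTE: these settings must match the configuration used to train the
# checkpoint loaded below; with strict=True, load_state_dict will raise an
# error on any key or shape mismatch.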

# Load the pretrained checkpoint (on CPU) and switch to inference mode
model = vb.VoiceBox(**voicebox_kwargs)
model.load_state_dict(
    torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')),
    strict=True)
model.eval()

# Convert a float32 waveform to 16-bit PCM, peak-normalizing first
def float32_to_int16(waveform):
    peak = np.abs(waveform).max()
    if peak > 0:  # guard against division by zero on silent audio
        waveform = waveform / peak
    waveform = (waveform * 32767).astype(np.int16)
    return waveform.ravel()
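
# For reference, a quick sketch of the expected conversion (illustrative values):
#   float32_to_int16(np.array([0.0, 0.5, -1.0], dtype=np.float32))
#   -> array([     0,  16383, -32767], dtype=int16)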

# Prediction function: takes an audio file path, returns (sample_rate, waveform)
def predict(inp):
    # Load the input audio file into a (channels, samples) float tensor
    waveform, sample_rate = torchaudio.load(inp)

    # Resample to the 16 kHz rate the model expects
    resample_to_16khz = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resample_to_16khz(waveform)
    sample_rate = 16000

    # Apply the VoiceBox perturbation without tracking gradients
    with torch.no_grad():
        waveform = model(waveform)

    # Convert the output to the (sample_rate, int16 array) format Gradio expects
    waveform = float32_to_int16(waveform.numpy())
    return sample_rate, waveform
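
# Optional local sanity check (hypothetical file path; uncomment to test
# without launching the interface):
# sr, out = predict('sample.wav')
# torchaudio.save('perturbed.wav',
#                 torch.from_numpy(out).unsqueeze(0).to(torch.float32) / 32767,
#                 sr)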

# Set up the Gradio interface: upload or record audio in, perturbed audio out
interface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath", label="Input speech"),
    outputs=gr.Audio(label="Perturbed speech")
)

interface.launch()
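
# Note: launch() serves locally by default; launch(share=True) would also
# create a temporary public Gradio link.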