import torch
import torchaudio
import numpy as np
import gradio as gr

import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb  # VoiceBox class
# import voicebox.src.attacks.online.voicebox_streamer as streamer  # VoiceBoxStreamer class (streaming variant)
from voicebox.src.constants import PPG_PRETRAINED_PATH
from voicebox.src.models import ResNetSE34V2
# Set VoiceBox default parameters
LOOKAHEAD = 5
voicebox_kwargs = {
    'win_length': 256,
    'ppg_encoder_hidden_size': 256,
    'use_phoneme_encoder': True,
    'use_pitch_encoder': True,
    'use_loudness_encoder': True,
    'spec_encoder_lookahead_frames': 0,
    'spec_encoder_type': 'mel',
    'spec_encoder_mlp_depth': 2,
    'bottleneck_lookahead_frames': LOOKAHEAD,
    'ppg_encoder_path': PPG_PRETRAINED_PATH,
    'n_bands': 128,
    'spec_encoder_hidden_size': 512,
    'bottleneck_skip': True,
    'bottleneck_hidden_size': 512,
    'bottleneck_feedforward_size': 512,
    'bottleneck_type': 'lstm',
    'bottleneck_depth': 2,
    'control_eps': 0.5,
    'projection_norm': float('inf'),
    'conditioning_dim': 512,
}
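
# Note: these defaults are presumably chosen to match the pretrained checkpoint
# loaded below; LOOKAHEAD (bottleneck_lookahead_frames) lets the bottleneck see
# future frames, trading added latency for more context.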
'''
# Alternative: streaming inference with VoiceBoxStreamer (requires the streamer
# import above and PyYAML):
import yaml

# Set streamer default parameters:
config_path = 'voicebox/pretrained/voicebox/voicebox_final.yaml'
with open(config_path) as f:
    config = yaml.safe_load(f)

# Load pretrained model (streamer):
model = streamer.VoiceBoxStreamer(**config)
model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt',
                                 map_location=torch.device('cpu')), strict=True)
model.eval()
'''
# Load pretrained model (VoiceBox):
model = vb.VoiceBox(**voicebox_kwargs)
model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt',
                                 map_location=torch.device('cpu')), strict=True)
model.eval()
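
# Optional (an assumption, not in the original): move the model to GPU when one
# is available; inputs built in predict() would then need .to(device) as well.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = model.to(device)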
# Define function to convert final audio to 16-bit PCM:
def float32_to_int16(waveform):
    # Peak-normalize, scale to the int16 range, and flatten to 1-D.
    waveform = waveform / np.abs(waveform).max()
    waveform = waveform * 32767
    waveform = waveform.astype(np.int16)
    waveform = waveform.ravel()
    return waveform
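
# Worked example (hypothetical input): the array is peak-normalized, scaled to
# the int16 range, truncated, and flattened:
# >>> float32_to_int16(np.array([[0.0, 0.5, -1.0]], dtype=np.float32))
# array([     0,  16383, -32767], dtype=int16)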
def get_embedding(recording):
    # Compute a 512-dim speaker embedding with a ResNetSE34V2 encoder.
    resnet = ResNetSE34V2(nOut=512, encoder_type='ASP')
    recording = recording.view(1, -1)
    embedding = resnet(recording)
    return embedding
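
# Note: this builds a fresh, randomly initialized ResNetSE34V2 on every call;
# for embeddings that actually characterize the speaker, the encoder would
# typically be constructed once and loaded with pretrained weights.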
# Define predict function:
def predict(inp):
    # Load the audio file Gradio passes in (as a file path) into a tensor.
    waveform, sample_rate = torchaudio.load(inp)
    # Resample to 16 kHz, the rate the model expects.
    transform_to_16khz = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = transform_to_16khz(waveform)
    sample_rate = 16000
    # Get the speaker embedding and tile it across the time axis
    # (waveform.shape[1] is the number of samples).
    condition_tensor = get_embedding(waveform)
    condition_tensor = condition_tensor.reshape(1, 1, -1)
    n_frames = waveform.shape[1]
    condition_tensor = condition_tensor.repeat(1, n_frames, 1)
    # Run the model without tracking gradients.
    with torch.no_grad():
        waveform = model(x=waveform, y=condition_tensor)
    # Convert the output into a (sample_rate, int16 array) tuple Gradio can play.
    waveform = waveform.numpy()
    waveform = float32_to_int16(waveform)
    return sample_rate, waveform
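
# Example usage outside Gradio (hypothetical path):
# sr, audio = predict('sample.wav')  # returns (16000, int16 numpy array)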
# Set up the Gradio interface
interface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(),
)
interface.launch()