Spaces:

ALeLacheur
/

voiceblock

Sleeping

App Files Community

ALeLacheur committed on Jul 10, 2024

Commit

678fd0b

1 Parent(s): b6809ba

Changed to 16 kHz and added speaker embedding

Browse files

Files changed (1) hide show

app.py +27 -2

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb #To acc
 #import voicebox.src.attacks.online.voicebox_streamer as streamer #To access VoiceBoxStreamer class
 import numpy as np
 from voicebox.src.constants import PPG_PRETRAINED_PATH
 #Set voicebox default parameters
 LOOKAHEAD = 5
@@ -28,7 +29,19 @@ voicebox_kwargs={'win_length': 256,
     'projection_norm': float('inf'),
     'conditioning_dim': 512}
-#Load pretrained model:
 model = vb.VoiceBox(**voicebox_kwargs)
 model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
 model.eval()
@@ -41,6 +54,12 @@ def float32_to_int16(waveform):
     waveform = waveform.ravel()
     return waveform
 #Define predict function:
 def predict(inp):
     #How to transform audio from string to tensor
@@ -51,9 +70,15 @@ def predict(inp):
     waveform = transform_to_16hz(waveform)
     sample_rate = 16000
     #Run model without changing weights
     with torch.no_grad():
-        waveform = model(waveform)
     #Transform output audio into gradio-readable format
     waveform = waveform.numpy()

 #import voicebox.src.attacks.online.voicebox_streamer as streamer #To access VoiceBoxStreamer class
 import numpy as np
 from voicebox.src.constants import PPG_PRETRAINED_PATH
+from voicebox.src.models import ResNetSE34V2
 #Set voicebox default parameters
 LOOKAHEAD = 5
     'projection_norm': float('inf'),
     'conditioning_dim': 512}
+'''
+#Set streamer default parameters:
+config_path = 'voicebox/pretrained/voicebox/voicebox_final.yaml'
+with open(config_path) as f:
+    config = yaml.safe_load(f)
+#Load pretrained model (streamer):
+model = streamer.VoiceBoxStreamer(**config)
+model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
+model.eval()
+'''
+#Load pretrained model (VoiceBox):
 model = vb.VoiceBox(**voicebox_kwargs)
 model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
 model.eval()
     waveform = waveform.ravel()
     return waveform
+def get_embedding(recording):
+    resnet = ResNetSE34V2(nOut=512, encoder_type='ASP')
+    recording = recording.view(1, -1)
+    embedding = resnet(recording)
+    return embedding
 #Define predict function:
 def predict(inp):
     #How to transform audio from string to tensor
     waveform = transform_to_16hz(waveform)
     sample_rate = 16000
+    #Get speaker embedding
+    condition_tensor = get_embedding(waveform)
+    condition_tensor = condition_tensor.reshape(1, 1, -1)
+    n_frames = waveform.shape[1]
+    condition_tensor = condition_tensor.repeat(1, n_frames, 1)
     #Run model without changing weights
     with torch.no_grad():
+        waveform = model(x=waveform, y=condition_tensor)
     #Transform output audio into gradio-readable format
     waveform = waveform.numpy()