ALeLacheur committed
Commit d09f267 · 1 Parent(s): 6caf132

Bug fix: Speaker embedding

Files changed (1):
  app.py  +0 -7
app.py CHANGED
@@ -55,39 +55,32 @@ def float32_to_int16(waveform):
     return waveform
 
 def get_embedding(recording):
-    print("Getting ResNet")
     resnet = ResNetSE34V2(nOut=512, encoder_type='ASP')
     recording = recording.view(1, -1)
-    print("Running ResNet")
     embedding = resnet(recording)
     return embedding
 
 #Define predict function:
 def predict(inp):
     #How to transform audio from string to tensor
-    print("Transforming audio to tensor")
     waveform, sample_rate = torchaudio.load(inp)
 
     #Resample to 16kHz
-    print("Resampling to 16Hz")
     transform_to_16hz = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
     waveform = transform_to_16hz(waveform)
     sample_rate = 16000
 
     #Get speaker embedding
-    print("Getting speaker embedding")
     condition_tensor = get_embedding(waveform)
     condition_tensor = condition_tensor.reshape(1, 1, -1)
     n_frames = waveform.shape[1]
     condition_tensor = condition_tensor.repeat(1, n_frames, 1)
 
     #Run model without changing weights
-    print("Running the model")
     with torch.no_grad():
         waveform = model(x=waveform, y=condition_tensor)
 
     #Transform output audio into gradio-readable format
-    print("Transforming returned audio")
     waveform = waveform.numpy()
     waveform = float32_to_int16(waveform)
     return sample_rate, waveform
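
Note: the hunk header shows the change starting inside float32_to_int16, whose body sits above the changed lines and is not part of this diff. A typical float-to-int16 conversion for audio playback, included here only as an assumed sketch (not the file's actual implementation), clips float samples to [-1, 1] and scales them to the int16 range:

import numpy as np

def float32_to_int16(waveform):
    # Assumed sketch only: clip to the valid float range, then scale to int16.
    waveform = np.clip(waveform, -1.0, 1.0)
    waveform = (waveform * 32767).astype(np.int16)
    return waveform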
 
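After the fix, predict still takes an audio file path, resamples it to 16 kHz, conditions the model on the speaker embedding, and returns a (sample_rate, int16 waveform) tuple in the format Gradio's audio output accepts. The interface wiring is outside this hunk; the sketch below is only an assumption of how predict might be exposed, using gr.Interface with a filepath-type gr.Audio input to match how predict reads its argument via torchaudio.load.

import gradio as gr

# Minimal sketch (assumption, not part of this diff): expose predict() via Gradio.
# predict(inp) expects an audio file path and returns a (sample_rate, waveform) tuple.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),  # hand the recording to torchaudio.load as a path
    outputs=gr.Audio(),
)

if __name__ == "__main__":
    demo.launch()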