hengjie yang committed on
Commit
3544cbd
·
1 Parent(s): 51df8b3

Fix tensor dimension mismatch in speaker embedding

Browse files
Files changed (1) hide show
  1. src/deploy/voice_clone.py +4 -0
src/deploy/voice_clone.py CHANGED
@@ -71,10 +71,14 @@ class VoiceCloneSystem:
71
  # 提取特征
72
  with torch.no_grad():
73
  embedding = self.speaker_encoder.encode_batch(waveform.to(self.device))
 
 
74
  embeddings.append(embedding)
75
 
76
  # 计算平均特征
77
  mean_embedding = torch.mean(torch.stack(embeddings), dim=0)
 
 
78
  return mean_embedding
79
 
80
  def generate_speech(
 
71
  # 提取特征
72
  with torch.no_grad():
73
  embedding = self.speaker_encoder.encode_batch(waveform.to(self.device))
74
+ # 调整维度
75
+ embedding = embedding.squeeze(0) # 移除批次维度
76
  embeddings.append(embedding)
77
 
78
  # 计算平均特征
79
  mean_embedding = torch.mean(torch.stack(embeddings), dim=0)
80
+ # 调整维度以匹配模型要求
81
+ mean_embedding = mean_embedding.view(1, -1) # [1, 512]
82
  return mean_embedding
83
 
84
  def generate_speech(