Spaces:
Sleeping
Sleeping
hengjie yang
committed on
Commit
·
3544cbd
1
Parent(s):
51df8b3
Fix tensor dimension mismatch in speaker embedding
Browse files
src/deploy/voice_clone.py
CHANGED
@@ -71,10 +71,14 @@ class VoiceCloneSystem:
|
|
71 |
# 提取特征
|
72 |
with torch.no_grad():
|
73 |
embedding = self.speaker_encoder.encode_batch(waveform.to(self.device))
|
|
|
|
|
74 |
embeddings.append(embedding)
|
75 |
|
76 |
# 计算平均特征
|
77 |
mean_embedding = torch.mean(torch.stack(embeddings), dim=0)
|
|
|
|
|
78 |
return mean_embedding
|
79 |
|
80 |
def generate_speech(
|
|
|
71 |
# 提取特征
|
72 |
with torch.no_grad():
|
73 |
embedding = self.speaker_encoder.encode_batch(waveform.to(self.device))
|
74 |
+
# 调整维度
|
75 |
+
embedding = embedding.squeeze(0) # 移除批次维度
|
76 |
embeddings.append(embedding)
|
77 |
|
78 |
# 计算平均特征
|
79 |
mean_embedding = torch.mean(torch.stack(embeddings), dim=0)
|
80 |
+
# 调整维度以匹配模型要求
|
81 |
+
mean_embedding = mean_embedding.view(1, -1) # [1, 512]
|
82 |
return mean_embedding
|
83 |
|
84 |
def generate_speech(
|