use spk_embedding when sft
cosyvoice/flow/flow.py
CHANGED
@@ -60,7 +60,7 @@ class MaskedDiffWithXvec(torch.nn.Module):
         token_len = batch['speech_token_len'].to(device)
         feat = batch['speech_feat'].to(device)
         feat_len = batch['speech_feat_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)
 
         # xvec projection
         embedding = F.normalize(embedding, dim=1)
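On the flow side, the module now reads the generic 'embedding' key and applies its x-vector projection; which speaker vector actually lands under that key is decided upstream in the executor change below. A minimal sketch of that consuming pattern (the XvecConditioner class and the sizes spk_embed_dim=192, output_size=80 are illustrative assumptions, not the exact CosyVoice layer names or dimensions):

import torch
import torch.nn.functional as F

class XvecConditioner(torch.nn.Module):
    # Toy stand-in for the x-vector projection inside MaskedDiffWithXvec.
    def __init__(self, spk_embed_dim: int = 192, output_size: int = 80):
        super().__init__()
        self.spk_embed_affine_layer = torch.nn.Linear(spk_embed_dim, output_size)

    def forward(self, batch: dict, device: torch.device) -> torch.Tensor:
        embedding = batch['embedding'].to(device)   # filled in by the executor
        embedding = F.normalize(embedding, dim=1)   # xvec projection, as in the diff
        return self.spk_embed_affine_layer(embedding)

cond = XvecConditioner()
print(cond({'embedding': torch.randn(4, 192)}, torch.device('cpu')).shape)  # torch.Size([4, 80])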
cosyvoice/llm/llm.py
CHANGED
@@ -97,7 +97,7 @@ class TransformerLM(torch.nn.Module):
         text_token_len = batch['text_token_len'].to(device)
         speech_token = batch['speech_token'].to(device)
         speech_token_len = batch['speech_token_len'].to(device)
-        embedding = batch['utt_embedding'].to(device)
+        embedding = batch['embedding'].to(device)
 
         # 1. prepare llm_target
         lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i]) + speech_token[i, :speech_token_len[i]].tolist() + [self.speech_token_size]) for i in range(text_token.size(0))]
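The context line above also shows how the LM target is built: the first (2 + text_len) positions are masked with IGNORE_ID so the loss only covers the speech tokens plus the end-of-speech id. A self-contained toy version (IGNORE_ID = -100 and speech_token_size = 4096 are assumed values for illustration):

import torch

IGNORE_ID = -100            # assumed ignore value, for illustration
speech_token_size = 4096    # assumed speech vocabulary size

text_token_len = torch.tensor([3])                  # 1 utterance, 3 text tokens
speech_token = torch.tensor([[11, 12, 13, 14, 15]])
speech_token_len = torch.tensor([5])

lm_target = [torch.tensor([IGNORE_ID] * (2 + text_token_len[i])
                          + speech_token[i, :speech_token_len[i]].tolist()
                          + [speech_token_size])
             for i in range(speech_token.size(0))]
print(lm_target[0])
# tensor([-100, -100, -100, -100, -100,   11,   12,   13,   14,   15, 4096])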
cosyvoice/utils/executor.py
CHANGED
@@ -52,6 +52,10 @@ class Executor:
                 info_dict["batch_idx"] = batch_idx
                 if cosyvoice_join(group_join, info_dict):
                     break
+                if info_dict["use_spk_embedding"] is True:
+                    batch_dict["embedding"] = batch_dict["spk_embedding"]
+                else:
+                    batch_dict["embedding"] = batch_dict["utt_embedding"]
 
                 # Disable gradient synchronizations across DDP processes.
                 # Within this context, gradients will be accumulated on module
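This is the single switch point: the batch already carries both an utterance-level x-vector (utt_embedding) and a speaker-level one (spk_embedding), and the executor copies one of them into the generic embedding key that llm.py and flow.py now read. A stripped-down sketch of that selection (select_embedding is a hypothetical helper, not a function in the repo):

import torch

def select_embedding(batch_dict: dict, info_dict: dict) -> dict:
    if info_dict["use_spk_embedding"] is True:
        # sft: condition on the speaker-level embedding
        batch_dict["embedding"] = batch_dict["spk_embedding"]
    else:
        # default / from-scratch training: per-utterance embedding
        batch_dict["embedding"] = batch_dict["utt_embedding"]
    return batch_dict

batch = {"spk_embedding": torch.randn(2, 192), "utt_embedding": torch.randn(2, 192)}
out = select_embedding(batch, {"use_spk_embedding": False})
assert out["embedding"] is out["utt_embedding"]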
examples/libritts/cosyvoice/conf/cosyvoice.fromscratch.yaml
CHANGED
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr
     scheduler_conf:
         warmup_steps: 25000
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2
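The new flag sits under train_conf, and the executor looks it up in info_dict, so flipping it in the YAML is all that is needed to switch the embedding source. A rough sketch of that plumbing, assuming the train_conf section reaches info_dict more or less verbatim (the full config is loaded by the CosyVoice training script and uses hyperpyyaml tags in other sections; this only parses the relevant snippet with plain PyYAML):

import yaml

snippet = """
train_conf:
    scheduler: warmuplr
    scheduler_conf:
        warmup_steps: 25000
    use_spk_embedding: False   # change to True during sft
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2
"""
info_dict = dict(yaml.safe_load(snippet)['train_conf'])
print(info_dict['use_spk_embedding'])   # False -> executor picks utt_embedding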
examples/libritts/cosyvoice/conf/cosyvoice.yaml
CHANGED
@@ -190,6 +190,7 @@ train_conf:
     scheduler: warmuplr # change to constantlr during sft
     scheduler_conf:
         warmup_steps: 2500
+    use_spk_embedding: False # change to True during sft
     max_epoch: 200
     grad_clip: 5
     accum_grad: 2