# Source: MOSS-TTSD / XY_Tokenizer / config / xy_tokenizer_config.yaml
# Revision: ea174b0 - "feat: app.py" (yhzx233)
# Shared sample rates. Each value is anchored so that later keys can reuse it
# through the aliases `*input_sample_rate` / `*output_sample_rate` instead of
# repeating the number.
# 16000 with the hop_length of 160 used further down gives 100 frames per
# second, matching the "100hz" rate comments in this file.
input_sample_rate: &input_sample_rate 16000
# 24000 matches the "24khz" target noted on the vocos section.
output_sample_rate: &output_sample_rate 24000
# Arguments for the tokenizer/codec generator, grouped per sub-module.
#
# NOTE(review): the nesting below is reconstructed. In the flattened text every
# key sat at column 0, which left `generator_params` null and produced
# duplicate top-level keys (`input_sample_rate`, `hop_length`, `d_model`,
# `input_dim`, ...) that most parsers silently resolve as last-wins.
# Confirm against the loader that every `*_kwargs` section is read from inside
# `generator_params`.
generator_params:
  input_sample_rate: *input_sample_rate
  output_sample_rate: *output_sample_rate

  # Waveform feature extractor. The values are mutually consistent with a
  # 30 s window at the input sample rate.
  feature_extractor_kwargs:
    chunk_length: 30
    feature_size: 80
    hop_length: 160  # 16000 / 160 = 100 frames per second
    n_fft: 400
    n_samples: 480000  # 30 * 16000
    nb_max_frames: 3000  # 480000 / 160
    padding_side: right
    padding_value: 0.0
    return_attention_mask: false
    sampling_rate: *input_sample_rate

  ## Codec args
  ## Semantic channel
  semantic_encoder_kwargs:  # 100 Hz -> 50 Hz
    num_mel_bins: 80  # equals feature_extractor_kwargs.feature_size
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2  # presumably the 2x downsampling 100 Hz -> 50 Hz; confirm
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  semantic_encoder_adapter_kwargs:  # 50 Hz
    input_dim: 768
    output_dim: 768
    d_model: 768
    max_source_positions: 1500  # 30 s * 50 Hz
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  ## Acoustic channel
  acoustic_encoder_kwargs:  # 100 Hz -> 50 Hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    encoder_layers: 12
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072
    activation_function: "gelu"

  ## Semantic & acoustic shared parameters
  pre_rvq_adapter_kwargs:  # 50 Hz
    # 1536 = 768 + 768; presumably the semantic and acoustic features are
    # concatenated before this adapter -- confirm in the model code.
    input_dim: 1536
    output_dim: 768
    d_model: 768
    max_source_positions: 1500  # 30 s * 50 Hz
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  downsample_kwargs:  # 50 Hz -> 12.5 Hz
    d_model: 768
    avg_pooler: 4  # 50 / 4 = 12.5

  quantizer_kwargs:  # 12.5 Hz
    # 3072 = 4 * 768; presumably the 4 pooled frames are stacked along the
    # feature axis -- confirm in the model code.
    input_dim: 3072
    rvq_dim: 512
    output_dim: 3072
    num_quantizers: 8
    codebook_size: 1024
    codebook_dim: 512
    quantizer_dropout: 0.0
    commitment: 1

  post_rvq_adapter_kwargs:  # 12.5 Hz
    input_dim: 3072
    output_dim: 3072
    d_model: 768
    max_source_positions: 375  # 30 s * 12.5 Hz
    encoder_layers: 4
    encoder_attention_heads: 12
    encoder_ffn_dim: 3072

  upsample_kwargs:  # 12.5 Hz -> 50 Hz
    d_model: 768
    stride: 4  # 12.5 * 4 = 50

  ## Acoustic channel
  acoustic_decoder_kwargs:  # 50 Hz -> 100 Hz
    num_mel_bins: 80
    sampling_rate: *input_sample_rate
    hop_length: 160
    stride_size: 2
    kernel_size: 3
    d_model: 768
    scale_embedding: false
    max_audio_seconds: 30
    decoder_layers: 12
    decoder_attention_heads: 12
    decoder_ffn_dim: 3072
    activation_function: "gelu"

  vocos_kwargs:  # 100 Hz -> 24 kHz
    input_channels: 80  # equals num_mel_bins of the acoustic decoder
    dim: 512
    intermediate_dim: 4096
    num_layers: 30
    n_fft: 960  # 4 * hop_size
    hop_size: 240  # 100 frames/s * 240 = 24000 samples/s
    padding: "same"