{
  "encoder_name": "WavLM",
  "encoder_config": {
    "hidden_dims": [512, 512, 512, 512, 512, 512, 512],
    "kernel_sizes": [10, 3, 3, 3, 3, 2, 2],
    "strides": [5, 2, 2, 2, 2, 2, 2],
    "num_layers": 6,
    "dim": 1024,
    "ffn_dim": 4096,
    "num_heads": 16,
    "num_buckets": 320,
    "max_distance": 800,
    "dropout": 0.0,
    "conv_pos": 128,
    "conv_pos_groups": 16
  },
  "compressor_name": "FocalEncoder",
  "compressor_config": {
    "input_dim": 1024,
    "output_dim": 13,
    "hidden_dims": [1024, 512, 256],
    "downscale_factors": [2, 1, 1],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "quantizer_name": "BinarySphericalQuantizer",
  "quantizer_config": {
    "codebook_size": 8192
  },
  "decompressor_name": "FocalDecoder",
  "decompressor_config": {
    "input_dim": 13,
    "output_dim": 1024,
    "hidden_dims": [256, 512, 1024],
    "upscale_factors": [1, 1, 2],
    "focal_window": 7,
    "focal_level": 2,
    "focal_factor": 2,
    "dropout": 0.0,
    "use_post_norm": false,
    "use_layerscale": false,
    "layerscale_init": 0.0001,
    "normalize_modulator": false
  },
  "decoder_name": "Vocos",
  "decoder_config": {
    "input_channels": 1024,
    "num_layers": 8,
    "dim": 512,
    "ffn_dim": 1536,
    "kernel_size": 7,
    "padding": 3,
    "layerscale_init": null,
    "n_fft": 1024,
    "hop_length": 320
  }
}
|