{
  "decoder": {
    "type": "istftnet",
    "upsample_kernel_sizes": [20, 12],
    "upsample_rates": [10, 6],
    "gen_istft_hop_size": 5,
    "gen_istft_n_fft": 20,
    "resblock_dilation_sizes": [
      [1, 3, 5],
      [1, 3, 5],
      [1, 3, 5]
    ],
    "resblock_kernel_sizes": [3, 7, 11],
    "upsample_initial_channel": 512
  },
  "dim_in": 64,
  "dropout": 0.2,
  "hidden_dim": 512,
  "max_conv_dim": 512,
  "max_dur": 50,
  "multispeaker": true,
  "n_layer": 3,
  "n_mels": 80,
  "n_token": 178,
  "style_dim": 128,
  "audio_length": "infinity",
  "voice_enabled": "yes",
  "hidden_layers": -10,
  "frequency_bins": "many",
  "character_count": false,
  "output_shape": [128, "unknown"],
  "std": [0.5, 0.5, 0.5],
  "num_classes": 1000,
  "pool_size": null,
  "first_conv": "patch_embed.proj",
  "classifier": "head",
  "mean": [0.5, 0.5, 0.5]
}