# NOTE(review): this file was rebuilt from a copy whose indentation had been
# stripped and whose lines were interleaved with bare `|` markers. Keys, values
# and their order are unchanged; the nesting was restored from key grouping and
# from key names that would otherwise collide at the top level (`evaluate`,
# `generate`, `metrics`, `l1`, `mel`, ...). Any header comment lines that
# originally preceded `defaults:` (for example a Hydra package directive) could
# not be recovered - confirm against the upstream file.

# Hydra defaults list. `_self_` is listed last, so the values in this file are
# merged after the entries above it.
defaults:
  - ../default
  - override /dset: audio/default
  - _self_

solver: compression
# `???` marks a mandatory value: it must be provided by another config or on
# the command line before it is accessed.
sample_rate: ???
channels: ???

# One number per loss name; presumably the weight of each term - confirm
# against the solver that consumes this section.
losses:
  adv: 4.
  feat: 4.
  l1: 0.1
  mel: 0.
  msspec: 2.
  sisnr: 0.
balancer:
  balance_grads: true
  ema_decay: 0.999
  per_batch_item: true
  total_norm: 1.

adversarial:
  every: 1
  adversaries: [msstftd]
  adv_loss: hinge
  feat_loss: l1

# Per-loss settings. Section names match the keys listed under `losses`
# (plus `l2` and `mrstft`).
l1: {}
l2: {}
mrstft:
  factor_sc: .5
  factor_mag: .5
  normalized: false
mel:
  sample_rate: ${sample_rate}
  n_fft: 1024
  hop_length: 256
  win_length: 1024
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: false
  floor_level: 1e-5
sisnr:
  sample_rate: ${sample_rate}
  segment: 5.
msspec:
  sample_rate: ${sample_rate}
  range_start: 6
  range_end: 11
  n_mels: 64
  f_min: 64
  f_max: null
  normalized: true
  alphas: false
  floor_level: 1e-5

# Metric settings.
metrics:
  visqol:
    mode: audio
    bin: null
    model: tcdaudio14_aacvopus_coresv_svrnsim_n.68_g.01_c1.model

# Adversary settings. Section names are the values accepted in
# `adversarial.adversaries` (only `msstftd` is enabled above).
msstftd:
  in_channels: 1
  out_channels: 1
  filters: 32
  norm: weight_norm
  n_ffts: [1024, 2048, 512, 256, 128]
  hop_lengths: [256, 512, 128, 64, 32]
  win_lengths: [1024, 2048, 512, 256, 128]
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
msd:
  in_channels: 1
  out_channels: 1
  scale_norms: [spectral_norm, weight_norm, weight_norm]
  kernel_sizes: [5, 3]
  filters: 16
  max_filters: 1024
  downsample_scales: [4, 4, 4, 4]
  inner_kernel_sizes: null
  groups: [4, 4, 4, 4]
  strides: null
  paddings: null
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
mpd:
  in_channels: 1
  out_channels: 1
  periods: [2, 3, 5, 7, 11]
  n_layers: 5
  kernel_size: 5
  stride: 3
  filters: 8
  filter_scales: 4
  max_filters: 1024
  activation: LeakyReLU
  activation_params: {negative_slope: 0.3}
  norm: weight_norm

# Data settings. The `train` / `valid` / `evaluate` / `generate` sub-sections
# hold per-stage values next to the shared ones.
dataset:
  batch_size: 64
  num_workers: 10
  segment_duration: 1
  train:
    num_samples: 500000
  valid:
    num_samples: 10000
  evaluate:
    batch_size: 32
    num_samples: 10000
  generate:
    batch_size: 32
    num_samples: 50
    segment_duration: 10

# Evaluation and generation stage settings.
evaluate:
  every: 25
  num_workers: 5
  metrics:
    visqol: false
    sisnr: true
generate:
  every: 25
  num_workers: 5
  audio:
    sample_rate: ${sample_rate}

# Checkpoint settings.
checkpoint:
  save_last: true
  save_every: 25
  keep_last: 10
  keep_every_states: null

# Optimization settings. `adam` and `ema` are nested here by inference
# (see the review note at the top of the file).
optim:
  epochs: 200
  updates_per_epoch: 2000
  lr: 3e-4
  max_norm: 0.
  optimizer: adam
  adam:
    betas: [0.5, 0.9]
    weight_decay: 0.
  ema:
    use: true
    updates: 1
    device: ${device}
    decay: 0.99