# NOTE: within this module `dict` is EasyDict (attribute-style access),
# not the builtin -- every `dict(...)` call below builds an EasyDict.
from easydict import EasyDict as dict

# Sizes shared by several sub-configs below.
D_MODEL = 768
HIDDEN_SIZE = 512

# Context encoder: a feature projection from HIDDEN_SIZE to D_MODEL followed
# by an encoder configured with a positional-embedding block and a per-layer
# block.
context_encoder = dict(
    feature_projection=dict(
        in_features=HIDDEN_SIZE,
        out_features=D_MODEL,
        dropout=0.1,
    ),
    encoder=dict(
        d_model=D_MODEL,
        num_layers=12,
        layer_drop=0.05,
        pos_embedding=dict(
            d_model=D_MODEL,
            kernel_size=3,
            groups=2,
            dropout=0.1,
        ),
        layer=dict(
            d_model=D_MODEL,
            num_heads=8,
            layer_norm_first=False,
            feed_forward_dim=2048,
            dropout=0.1,
        ),
    ),
)

# Feature extractor: three parallel tuples, each with seven entries
# (presumably one entry per layer -- confirm against the consuming module).
feature_extractor = dict(
    num_channels=(HIDDEN_SIZE,) * 7,
    kernel_sizes=(10, 3, 3, 3, 3, 2, 2),
    strides=(5, 2, 2, 2, 2, 2, 2),
)

# Quantizer: takes HIDDEN_SIZE input features and shares D_MODEL with the
# context encoder.
quantizer = dict(
    in_features=HIDDEN_SIZE,
    num_codebooks=2,
    num_codewords=320,
    d_model=D_MODEL,
)

# Top-level pretraining config: bundles the three sub-configs above together
# with the masking, negative-sampling and loss settings.
wav2vec2_pretraining = dict(
    context_encoder=context_encoder,
    feature_extractor=feature_extractor,
    quantizer=quantizer,
    mask_prob=0.65,
    mask_length=10,
    min_masks=2,
    num_negatives=100,
    contrastive_logits_temperature=0.1,
    diversity_loss_weight=0.2,
)