# Listing metadata (not part of the configuration): file size 3,835 bytes; revision hashes shown by the viewer: 994e8f9, 234d8a4, 2942116.
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Conformer Encoder
# Decoder: Transformer Decoder + (CTC/att joint)
# Tokens: bpe
# losses: CTC + KLdiv (Label Smoothing loss)
# Authors: Titouan Parcollet
# ############################################################################
# Feature parameters
# All three values are forwarded to the Fbank feature extractor
# (compute_features); n_mels is also the last dimension of the encoder's
# input_shape.
# NOTE(review): sample_rate is presumably in Hz — confirm against Fbank.
sample_rate: 16000
n_fft: 400
n_mels: 80
####################### Model Parameters ###########################
# Transformer
# d_model is passed to the Transformer and is also the input_size of both
# linear heads (ctc_lin and seq_lin).
d_model: 1024
# nhead, the layer counts and d_ffn are passed to the Transformer unchanged.
nhead: 16
num_encoder_layers: 18
num_decoder_layers: 6
d_ffn: 3072
# Passed to the Transformer as its `dropout` argument.
transformer_dropout: 0.1
# Declared with !name (a callable reference, not an instance); the same value
# feeds both `activation` and `conformer_activation` of the Transformer.
activation: !name:torch.nn.GELU
# Used as the Transformer's tgt_vocab and as n_neurons of ctc_lin and seq_lin.
output_neurons: 5120
# Outputs
# blank_index is passed to ctc_scorer.
blank_index: 3
# NOTE(review): label_smoothing and pad_index are not referenced anywhere in
# the visible part of this file — presumably left over from the training
# recipe or read by external code; confirm before removing.
label_smoothing: 0.1
pad_index: 0
# bos_index and eos_index are passed to the decoder; eos_index is also passed
# to ctc_scorer.
bos_index: 1
eos_index: 2
# Decoding parameters
# min/max_decode_ratio and test_beam_size (as beam_size) are passed to the
# decoder (S2STransformerBeamSearcher).
min_decode_ratio: 0.0
max_decode_ratio: 1.0
test_beam_size: 10
# Weight given to the `ctc` scorer inside the ScorerBuilder `weights` map.
ctc_weight_decode: 0.3
# Passed to ScorerBuilder as scorer_beam_scale.
scorer_beam_scale: 0.3
# NOTE(review): not referenced elsewhere in the visible part of this file —
# presumably read by the inference interface that loads it; confirm.
transformer_beam_search: True
############################## models ################################
# Tokenizer object, constructed with no arguments here; it is listed in the
# pretrainer's loadables.
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Input normalizer. norm_type is a constructor argument and must be nested
# under the !new tag; at column 0 it would become an unrelated top-level key
# and the normalizer would be built without it.
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
# Filterbank (Fbank) feature extractor. The keys below are constructor
# arguments and must be nested under the !new tag; at column 0 they would
# overwrite the top-level sample_rate / n_fft / n_mels entries with references
# to themselves and Fbank would receive no arguments.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
# Convolutional front end: two blocks of one layer each, with per-block
# channels, kernel sizes, strides and residual flags given as tuples.
# The arguments are nested under the !new tag so they are passed to the
# constructor instead of being parsed as top-level keys.
# NOTE(review): the last entry of input_shape (80) presumably mirrors n_mels —
# keep the two in sync.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
# Conformer-encoder / Transformer-decoder ASR model. All keys below are
# constructor arguments and must be nested under the !new tag; at column 0
# entries such as d_model, nhead and activation would overwrite the top-level
# hyperparameters with references to themselves.
# NOTE(review): input_size 640 is presumably the flattened CNN output
# (32 channels x 80 mel bins / 4 from two stride-2 blocks) — confirm against
# ConvolutionFrontEnd before changing n_mels, out_channels or strides.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    conformer_activation: !ref <activation>
    activation: !ref <activation>
    encoder_module: conformer
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False
# Linear head mapping d_model features to output_neurons; it is handed to
# ctc_scorer as ctc_fc and exposed in `modules`. The arguments are nested so
# they reach the constructor rather than becoming top-level keys.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Linear head mapping d_model features to output_neurons; it is passed to the
# beam-search decoder together with the Transformer. The arguments are nested
# so they reach the constructor rather than becoming top-level keys.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Groups the trainable parts (CNN, Transformer and both linear heads) into a
# single ModuleList; the pretrainer loads it under the `asr` key.
asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
# Scorer
# CTC scorer built on the ctc_lin head. The arguments are nested under the
# !new tag; at column 0 eos_index and blank_index would overwrite the
# top-level values with references to themselves.
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>
# Scorer builder used by the beam-search decoder. `weights` is a nested
# mapping whose `ctc` entry carries ctc_weight_decode; full_scorers, weights
# and scorer_beam_scale are all constructor arguments and must be indented
# under the !new tag.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <ctc_scorer>]
    weights:
        ctc: !ref <ctc_weight_decode>
    scorer_beam_scale: !ref <scorer_beam_scale>
# We compose the inference (encoder) pipeline: feature extraction, input
# normalization, then the CNN front end, in that order.
# The stages are nested under the !new tag; at column 0 an entry such as
# `CNN: !ref <CNN>` would redefine the top-level CNN as a reference to itself.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    CNN: !ref <CNN>
# Beam-search decoder over the Transformer and its seq_lin head, using the
# scorer defined above. Every key below is a constructor argument and must be
# nested under the !new tag; in particular the `modules` list here is an
# argument of the searcher and must not collide with the top-level `modules`
# mapping.
decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules: [!ref <Transformer>, !ref <seq_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    temperature: 1.15
    using_eos_threshold: True
    scorer: !ref <scorer>
# Named components exposed under the top-level `modules` mapping.
# NOTE(review): presumably consumed by the inference interface that loads
# this file — confirm the expected key names there.
# The entries must be indented: at column 0 each one would redefine a
# top-level key as a reference to itself and `modules` would be empty.
modules:
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    transformer: !ref <Transformer>
    decoder: !ref <decoder>
    ctc_lin: !ref <ctc_lin>
# Log-softmax over the last dimension. `dim` is nested so it is passed to the
# constructor instead of becoming a top-level key.
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
# Pretrainer with the objects whose parameters it loads: the normalizer, the
# ASR ModuleList (under the key `asr`) and the tokenizer.
# `loadables` and its entries are nested under the !new tag. The stray
# trailing "|" after the tokenizer reference has been removed: with extra text
# after the reference, the value would be a string interpolation rather than
# the tokenizer object.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>
        tokenizer: !ref <tokenizer>