---
# Model configuration: data/tokenization settings plus encoder and decoder
# hyperparameters.
# NOTE(review): the nesting below was reconstructed from key order after the
# original indentation was lost — confirm against the code that loads this file.

data:
  # presumably in Hz — TODO confirm with the audio loading code
  sampling_rate: 32000
  segment_seconds: 10
  tokenizer_type: 'HuggingFaceTB/SmolLM2-135M'
  text_tokenization_len: 129

model:
  encoder:
    audioenc_name: 'HTSAT'
    transformer_embed_dim: 768
    out_emb: 768
    d_proj: 576
  decoder:
    # Same identifier as data.tokenizer_type above.
    text_decoder: 'HuggingFaceTB/SmolLM2-135M'
    prefix_length: 389
  # NOTE(review): placed under `model` as the most likely parent; it may
  # instead belong under `decoder` or at top level — verify against the loader.
  model_type: Mellow