# This is an example that demonstrates how to configure a model file.
# You can modify the configuration according to your own requirements.

# to print the register_table:
# from funasr.register import tables
# tables.print()
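
# A minimal inference sketch for a model built from a config like this one, using
# FunASR's AutoModel interface. The model id and the generate() arguments below are
# assumptions taken from typical emotion2vec usage, not part of this config; check
# the FunASR documentation for the exact values:
#
#   from funasr import AutoModel
#
#   model = AutoModel(model="iic/emotion2vec_base")
#   res = model.generate("example.wav", granularity="utterance", extract_embedding=False)
#   print(res)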
# network architecture
model: Emotion2vec
model_conf:
    loss_beta: 0.0
    loss_scale: null
    depth: 8
    start_drop_path_rate: 0.0
    end_drop_path_rate: 0.0
    num_heads: 12
    norm_eps: 1e-05
    norm_affine: true
    encoder_dropout: 0.1
    post_mlp_drop: 0.1
    attention_dropout: 0.1
    activation_dropout: 0.0
    dropout_input: 0.0
    layerdrop: 0.05
    embed_dim: 768
    mlp_ratio: 4.0
    layer_norm_first: false
    average_top_k_layers: 8
    end_of_block_targets: false
    clone_batch: 8
    layer_norm_target_layer: false
    batch_norm_target_layer: false
    instance_norm_target_layer: true
    instance_norm_targets: false
    layer_norm_targets: false
    ema_decay: 0.999
    ema_same_dtype: true
    log_norms: true
    ema_end_decay: 0.99999
    ema_anneal_end_step: 20000
    ema_encoder_only: false
    max_update: 100000
    extractor_mode: layer_norm
    shared_decoder: null
    min_target_var: 0.1
    min_pred_var: 0.01
    supported_modality: AUDIO
    mae_init: false
    seed: 1
    skip_ema: false
    cls_loss: 1.0
    recon_loss: 0.0
    d2v_loss: 1.0
    decoder_group: false
    adversarial_training: false
    adversarial_hidden_dim: 128
    adversarial_weight: 0.1
    cls_type: chunk
    normalize: true
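
    # The teacher used for self-distillation is an EMA of the student. A minimal
    # sketch of how ema_decay, ema_end_decay, and ema_anneal_end_step interact,
    # assuming the linear annealing used in data2vec-style training (this helper
    # is illustrative, not part of FunASR):
    #
    #   def annealed_decay(step, start=0.999, end=0.99999, end_step=20000):
    #       if step >= end_step:
    #           return end
    #       return start + (end - start) * step / end_step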
    modalities:
        audio:
            type: AUDIO
            prenet_depth: 4
            prenet_layerdrop: 0.05
            prenet_dropout: 0.1
            start_drop_path_rate: 0.0
            end_drop_path_rate: 0.0
            num_extra_tokens: 10
            init_extra_token_zero: true
            mask_noise_std: 0.01
            mask_prob_min: null
            mask_prob: 0.5
            inverse_mask: false
            mask_prob_adjust: 0.05
            keep_masked_pct: 0.0
            mask_length: 5
            add_masks: false
            remove_masks: false
            mask_dropout: 0.0
            encoder_zero_mask: true
            mask_channel_prob: 0.0
            mask_channel_length: 64
            ema_local_encoder: false
            local_grad_mult: 1.0
            use_alibi_encoder: true
            alibi_scale: 1.0
            learned_alibi: false
            alibi_max_pos: null
            learned_alibi_scale: true
            learned_alibi_scale_per_head: true
            learned_alibi_scale_per_layer: false
            num_alibi_heads: 12
            model_depth: 8
            decoder:
                decoder_dim: 384
                decoder_groups: 16
                decoder_kernel: 7
                decoder_layers: 4
                input_dropout: 0.1
                add_positions_masked: false
                add_positions_all: false
                decoder_residual: true
                projection_layers: 1
                projection_ratio: 2.0
            extractor_mode: layer_norm
            # (dim, kernel, stride) tuples; see the note at the end of this file
            feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
            conv_pos_width: 95
            conv_pos_groups: 16
            conv_pos_depth: 5
            conv_pos_pre_ln: false
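
# A note on feature_encoder_spec: the string above is a Python expression listing
# (dim, kernel, stride) tuples for the convolutional waveform frontend. A minimal
# sketch of how such a spec can be interpreted (assumes 16 kHz input audio and
# eval()-style parsing as in fairseq-derived code; the snippet is illustrative,
# not part of FunASR):
#
#   spec = eval("[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]")
#   total_stride = 1
#   for _dim, _kernel, stride in spec:
#       total_stride *= stride
#   print(total_stride)            # 320 samples per output frame
#   print(total_stride / 16000)    # 0.02 -> one frame every 20 ms at 16 kHz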