Spaces:

FunAudioLLM
/

InspireMusic

Running on Zero

App Files Files Community

chong.zhang commited on Feb 7

Commit

3460b5c

1 Parent(s): 16850a3

update

Browse files

Files changed (6) hide show

app.py +1 -0
example/conf/InspireMusic-1.5B-24kHz.yaml +171 -0
example/conf/InspireMusic-1.5B-Long.yaml +171 -0
example/conf/InspireMusic-1.5B.yaml +171 -0
example/conf/InspireMusic-Base-24kHz.yaml +171 -0
example/conf/InspireMusic-Base.yaml +180 -0

app.py CHANGED Viewed

@@ -25,6 +25,7 @@ import sys
 os.system('nvidia-smi')
 os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
 # os.system('pip install flash-attn --no-build-isolation')
 # os.system('git submodule update --init --recursive')
 # os.system('git clone https://github.com/shivammehta25/Matcha-TTS.git third_party/')

 os.system('nvidia-smi')
 os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
+os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
 # os.system('pip install flash-attn --no-build-isolation')
 # os.system('git submodule update --init --recursive')
 # os.system('git clone https://github.com/shivammehta25/Matcha-TTS.git third_party/')

example/conf/InspireMusic-1.5B-24kHz.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+basemodel_path: 'pretrained_models/InspireMusic-1.5B-24kHz/'
+generator_path: 'pretrained_models/InspireMusic-1.5B-24kHz/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-1.5B-Long.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1988]
+__set_seed2: !apply:numpy.random.seed [1988]
+__set_seed3: !apply:torch.manual_seed [1988]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1988]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+basemodel_path: 'pretrained_models/InspireMusic-1.5B-Long/'
+generator_path: 'pretrained_models/InspireMusic-1.5B-Long/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-1.5B.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1988]
+__set_seed2: !apply:numpy.random.seed [1988]
+__set_seed3: !apply:torch.manual_seed [1988]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1988]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+basemodel_path: 'pretrained_models/InspireMusic-1.5B/'
+generator_path: 'pretrained_models/InspireMusic-1.5B/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-Base-24kHz.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 896
+llm_output_size: 896
+basemodel_path: 'pretrained_models/InspireMusic-Base-24kHz/'
+generator_path: 'pretrained_models/InspireMusic-Base-24kHz/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 7.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.0"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-Base.yaml ADDED Viewed

	@@ -0,0 +1,180 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+sample_rate: 24000
+target_sample_rate: 48000
+text_encoder_input_size: 512
+llm_input_size: 896
+llm_output_size: 896
+basemodel_path: 'pretrained_models/InspireMusic-Base/'
+generator_path: 'pretrained_models/InspireMusic-Base/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for intialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.0"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 20000
+    min_length: 1
+    token_max_length: 200
+    token_min_length: 1
+    max_acoustic_length: 20000
+    min_acoustic_length: 1800
+    mode: 'train_flow'
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 15500 # llm 12000
+    # batch_type: 'static'
+    # batch_size: 2 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+    mode: 'train'
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500