chong.zhang committed on
Commit
3460b5c
·
1 Parent(s): 16850a3
app.py CHANGED
@@ -25,6 +25,7 @@ import sys
25
 
26
  os.system('nvidia-smi')
27
  os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
 
28
  # os.system('pip install flash-attn --no-build-isolation')
29
  # os.system('git submodule update --init --recursive')
30
  # os.system('git clone https://github.com/shivammehta25/Matcha-TTS.git third_party/')
 
25
 
26
  os.system('nvidia-smi')
27
  os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
28
+ os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
29
  # os.system('pip install flash-attn --no-build-isolation')
30
  # os.system('git submodule update --init --recursive')
31
  # os.system('git clone https://github.com/shivammehta25/Matcha-TTS.git third_party/')
example/conf/InspireMusic-1.5B-24kHz.yaml ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1024]
3
+ __set_seed2: !apply:numpy.random.seed [1024]
4
+ __set_seed3: !apply:torch.manual_seed [1024]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1024]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1536
11
+ llm_output_size: 1536
12
+
13
+ basemodel_path: 'pretrained_models/InspireMusic-1.5B-24kHz/'
14
+ generator_path: 'pretrained_models/InspireMusic-1.5B-24kHz/music_tokenizer'
15
+
16
+ # model params
17
+ # For every class/function included in this repo, we use the !name: or !new: tag for initialization, so that users can find all the corresponding classes/functions from this single YAML file.
18
+ # for system/third_party class/function, we do not require this.
19
+ llm: !new:inspiremusic.llm.llm.LLM
20
+ text_encoder_input_size: !ref <text_encoder_input_size>
21
+ llm_input_size: !ref <llm_input_size>
22
+ llm_output_size: !ref <llm_output_size>
23
+ audio_token_size: 4096
24
+ length_normalized_loss: True
25
+ lsm_weight: 0
26
+ text_encoder_conf:
27
+ name: "none"
28
+ llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
29
+ input_size: !ref <text_encoder_input_size>
30
+ pretrain_path: !ref <basemodel_path>
31
+
32
+ sampling: !name:inspiremusic.utils.common.topk_sampling
33
+ top_k: 350
34
+ train_cfg_ratio: 0.2
35
+ infer_cfg_ratio: 3.0
36
+ flow: !new:inspiremusic.flow.flow.MaskedDiff
37
+ input_size: 256
38
+ output_size: 80
39
+ output_type: 'mel'
40
+ vocab_size: 4096
41
+ input_frame_rate: 75
42
+ only_mask_loss: True
43
+ encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
44
+ output_size: 512
45
+ attention_heads: 4
46
+ linear_units: 1024
47
+ num_blocks: 3
48
+ dropout_rate: 0.1
49
+ positional_dropout_rate: 0.1
50
+ attention_dropout_rate: 0.1
51
+ normalize_before: True
52
+ input_layer: 'linear'
53
+ pos_enc_layer_type: 'rel_pos_espnet'
54
+ selfattention_layer_type: 'rel_selfattn'
55
+ input_size: 256
56
+ use_cnn_module: False
57
+ macaron_style: False
58
+ length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
59
+ channels: 512
60
+ sampling_ratios: [1, 1, 1, 1]
61
+ decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
62
+ in_channels: 240
63
+ cfm_params: !new:omegaconf.DictConfig
64
+ content:
65
+ sigma_min: 1e-06
66
+ solver: 'euler'
67
+ t_scheduler: 'cosine'
68
+ training_cfg_rate: 0.2
69
+ inference_cfg_rate: 0.7
70
+ reg_loss_type: 'l1'
71
+ estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
72
+ in_channels: 1024
73
+ out_channels: 512
74
+ channels: [256, 256]
75
+ dropout: 0.0
76
+ attention_head_dim: 64
77
+ n_blocks: 4
78
+ num_mid_blocks: 8
79
+ num_heads: 8
80
+ act_fn: 'gelu'
81
+ generator_model_dir: !ref <generator_path>
82
+
83
+ hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
84
+ in_channels: 80
85
+ base_channels: 512
86
+ nb_harmonics: 8
87
+ sampling_rate: !ref <sample_rate>
88
+ nsf_alpha: 0.1
89
+ nsf_sigma: 0.003
90
+ nsf_voiced_threshold: 10
91
+ upsample_rates: [8, 8]
92
+ upsample_kernel_sizes: [16, 16]
93
+ istft_params:
94
+ n_fft: 16
95
+ hop_len: 4
96
+ resblock_kernel_sizes: [3, 7, 11]
97
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
98
+ source_resblock_kernel_sizes: [7, 11]
99
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
100
+ lrelu_slope: 0.1
101
+ audio_limit: 0.99
102
+ f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
103
+ num_class: 1
104
+ in_channels: 80
105
+ cond_channels: 512
106
+
107
+ wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
108
+
109
+ # processor functions
110
+ parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
111
+ get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
112
+ tokenizer_path: !ref <basemodel_path>
113
+ tokenizer_name: "qwen-2.5"
114
+ allowed_special: 'all'
115
+ tokenize: !name:inspiremusic.dataset.processor.tokenize
116
+ get_tokenizer: !ref <get_tokenizer>
117
+ allowed_special: !ref <allowed_special>
118
+ filter: !name:inspiremusic.dataset.processor.filter
119
+ max_length: 28000
120
+ min_length: 0
121
+ token_max_length: 200
122
+ token_min_length: 1
123
+ resample: !name:inspiremusic.dataset.processor.resample
124
+ resample_rate: !ref <sample_rate>
125
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
126
+ n_fft: 1024
127
+ num_mels: 128
128
+ sampling_rate: !ref <sample_rate>
129
+ hop_size: 256
130
+ win_size: 1024
131
+ fmin: 0
132
+ fmax: 24000
133
+ center: False
134
+ compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
135
+ feat_extractor: !ref <feat_extractor>
136
+ parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
137
+ normalize: True
138
+ shuffle: !name:inspiremusic.dataset.processor.shuffle
139
+ shuffle_size: 1000
140
+ sort: !name:inspiremusic.dataset.processor.sort
141
+ sort_size: 500 # sort_size should be less than shuffle_size
142
+ batch: !name:inspiremusic.dataset.processor.batch
143
+ batch_type: 'dynamic'
144
+ max_frames_in_batch: 10000 # llm 12000
145
+ padding: !name:inspiremusic.dataset.processor.padding
146
+
147
+ # dataset processor pipeline
148
+ data_pipeline: [
149
+ !ref <parquet_opener>,
150
+ !ref <tokenize>,
151
+ !ref <shuffle>,
152
+ !ref <sort>,
153
+ !ref <filter>,
154
+ !ref <batch>,
155
+ !ref <padding>,
156
+ ]
157
+
158
+
159
+ # train conf
160
+ train_conf:
161
+ optim: adam
162
+ optim_conf:
163
+ lr: 0.0001 # change to 0.001 if you want to train flow from scratch
164
+ scheduler: warmuplr
165
+ scheduler_conf:
166
+ warmup_steps: 5000
167
+ max_epoch: 200
168
+ grad_clip: 5
169
+ accum_grad: 2
170
+ log_interval: 100
171
+ save_per_step: 500
example/conf/InspireMusic-1.5B-Long.yaml ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1988]
3
+ __set_seed2: !apply:numpy.random.seed [1988]
4
+ __set_seed3: !apply:torch.manual_seed [1988]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1988]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1536
11
+ llm_output_size: 1536
12
+
13
+ basemodel_path: 'pretrained_models/InspireMusic-1.5B-Long/'
14
+ generator_path: 'pretrained_models/InspireMusic-1.5B-Long/music_tokenizer'
15
+
16
+ # model params
17
+ # For every class/function included in this repo, we use the !name: or !new: tag for initialization, so that users can find all the corresponding classes/functions from this single YAML file.
18
+ # for system/third_party class/function, we do not require this.
19
+ llm: !new:inspiremusic.llm.llm.LLM
20
+ text_encoder_input_size: !ref <text_encoder_input_size>
21
+ llm_input_size: !ref <llm_input_size>
22
+ llm_output_size: !ref <llm_output_size>
23
+ audio_token_size: 4096
24
+ length_normalized_loss: True
25
+ lsm_weight: 0
26
+ text_encoder_conf:
27
+ name: "none"
28
+ llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
29
+ input_size: !ref <text_encoder_input_size>
30
+ pretrain_path: !ref <basemodel_path>
31
+
32
+ sampling: !name:inspiremusic.utils.common.topk_sampling
33
+ top_k: 350
34
+ train_cfg_ratio: 0.2
35
+ infer_cfg_ratio: 3.0
36
+ flow: !new:inspiremusic.flow.flow.MaskedDiff
37
+ input_size: 256
38
+ output_size: 80
39
+ output_type: 'mel'
40
+ vocab_size: 4096
41
+ input_frame_rate: 75
42
+ only_mask_loss: True
43
+ encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
44
+ output_size: 512
45
+ attention_heads: 4
46
+ linear_units: 1024
47
+ num_blocks: 3
48
+ dropout_rate: 0.1
49
+ positional_dropout_rate: 0.1
50
+ attention_dropout_rate: 0.1
51
+ normalize_before: True
52
+ input_layer: 'linear'
53
+ pos_enc_layer_type: 'rel_pos_espnet'
54
+ selfattention_layer_type: 'rel_selfattn'
55
+ input_size: 256
56
+ use_cnn_module: False
57
+ macaron_style: False
58
+ length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
59
+ channels: 512
60
+ sampling_ratios: [1, 1, 1, 1]
61
+ decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
62
+ in_channels: 240
63
+ cfm_params: !new:omegaconf.DictConfig
64
+ content:
65
+ sigma_min: 1e-06
66
+ solver: 'euler'
67
+ t_scheduler: 'cosine'
68
+ training_cfg_rate: 0.2
69
+ inference_cfg_rate: 0.7
70
+ reg_loss_type: 'l1'
71
+ estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
72
+ in_channels: 1024
73
+ out_channels: 512
74
+ channels: [256, 256]
75
+ dropout: 0.0
76
+ attention_head_dim: 64
77
+ n_blocks: 4
78
+ num_mid_blocks: 8
79
+ num_heads: 8
80
+ act_fn: 'gelu'
81
+ generator_model_dir: !ref <generator_path>
82
+
83
+ hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
84
+ in_channels: 80
85
+ base_channels: 512
86
+ nb_harmonics: 8
87
+ sampling_rate: !ref <sample_rate>
88
+ nsf_alpha: 0.1
89
+ nsf_sigma: 0.003
90
+ nsf_voiced_threshold: 10
91
+ upsample_rates: [8, 8]
92
+ upsample_kernel_sizes: [16, 16]
93
+ istft_params:
94
+ n_fft: 16
95
+ hop_len: 4
96
+ resblock_kernel_sizes: [3, 7, 11]
97
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
98
+ source_resblock_kernel_sizes: [7, 11]
99
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
100
+ lrelu_slope: 0.1
101
+ audio_limit: 0.99
102
+ f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
103
+ num_class: 1
104
+ in_channels: 80
105
+ cond_channels: 512
106
+
107
+ wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
108
+
109
+ # processor functions
110
+ parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
111
+ get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
112
+ tokenizer_path: !ref <basemodel_path>
113
+ tokenizer_name: "qwen-2.5"
114
+ allowed_special: 'all'
115
+ tokenize: !name:inspiremusic.dataset.processor.tokenize
116
+ get_tokenizer: !ref <get_tokenizer>
117
+ allowed_special: !ref <allowed_special>
118
+ filter: !name:inspiremusic.dataset.processor.filter
119
+ max_length: 28000
120
+ min_length: 0
121
+ token_max_length: 200
122
+ token_min_length: 1
123
+ resample: !name:inspiremusic.dataset.processor.resample
124
+ resample_rate: !ref <sample_rate>
125
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
126
+ n_fft: 1024
127
+ num_mels: 128
128
+ sampling_rate: !ref <sample_rate>
129
+ hop_size: 256
130
+ win_size: 1024
131
+ fmin: 0
132
+ fmax: 24000
133
+ center: False
134
+ compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
135
+ feat_extractor: !ref <feat_extractor>
136
+ parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
137
+ normalize: True
138
+ shuffle: !name:inspiremusic.dataset.processor.shuffle
139
+ shuffle_size: 1000
140
+ sort: !name:inspiremusic.dataset.processor.sort
141
+ sort_size: 500 # sort_size should be less than shuffle_size
142
+ batch: !name:inspiremusic.dataset.processor.batch
143
+ batch_type: 'dynamic'
144
+ max_frames_in_batch: 10000 # llm 12000
145
+ padding: !name:inspiremusic.dataset.processor.padding
146
+
147
+ # dataset processor pipeline
148
+ data_pipeline: [
149
+ !ref <parquet_opener>,
150
+ !ref <tokenize>,
151
+ !ref <shuffle>,
152
+ !ref <sort>,
153
+ !ref <filter>,
154
+ !ref <batch>,
155
+ !ref <padding>,
156
+ ]
157
+
158
+
159
+ # train conf
160
+ train_conf:
161
+ optim: adam
162
+ optim_conf:
163
+ lr: 0.0001 # change to 0.001 if you want to train flow from scratch
164
+ scheduler: warmuplr
165
+ scheduler_conf:
166
+ warmup_steps: 5000
167
+ max_epoch: 200
168
+ grad_clip: 5
169
+ accum_grad: 2
170
+ log_interval: 100
171
+ save_per_step: 500
example/conf/InspireMusic-1.5B.yaml ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1988]
3
+ __set_seed2: !apply:numpy.random.seed [1988]
4
+ __set_seed3: !apply:torch.manual_seed [1988]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1988]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 1536
11
+ llm_output_size: 1536
12
+
13
+ basemodel_path: 'pretrained_models/InspireMusic-1.5B/'
14
+ generator_path: 'pretrained_models/InspireMusic-1.5B/music_tokenizer'
15
+
16
+ # model params
17
+ # For every class/function included in this repo, we use the !name: or !new: tag for initialization, so that users can find all the corresponding classes/functions from this single YAML file.
18
+ # for system/third_party class/function, we do not require this.
19
+ llm: !new:inspiremusic.llm.llm.LLM
20
+ text_encoder_input_size: !ref <text_encoder_input_size>
21
+ llm_input_size: !ref <llm_input_size>
22
+ llm_output_size: !ref <llm_output_size>
23
+ audio_token_size: 4096
24
+ length_normalized_loss: True
25
+ lsm_weight: 0
26
+ text_encoder_conf:
27
+ name: "none"
28
+ llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
29
+ input_size: !ref <text_encoder_input_size>
30
+ pretrain_path: !ref <basemodel_path>
31
+
32
+ sampling: !name:inspiremusic.utils.common.topk_sampling
33
+ top_k: 350
34
+ train_cfg_ratio: 0.2
35
+ infer_cfg_ratio: 3.0
36
+ flow: !new:inspiremusic.flow.flow.MaskedDiff
37
+ input_size: 256
38
+ output_size: 80
39
+ output_type: 'mel'
40
+ vocab_size: 4096
41
+ input_frame_rate: 75
42
+ only_mask_loss: True
43
+ encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
44
+ output_size: 512
45
+ attention_heads: 4
46
+ linear_units: 1024
47
+ num_blocks: 3
48
+ dropout_rate: 0.1
49
+ positional_dropout_rate: 0.1
50
+ attention_dropout_rate: 0.1
51
+ normalize_before: True
52
+ input_layer: 'linear'
53
+ pos_enc_layer_type: 'rel_pos_espnet'
54
+ selfattention_layer_type: 'rel_selfattn'
55
+ input_size: 256
56
+ use_cnn_module: False
57
+ macaron_style: False
58
+ length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
59
+ channels: 512
60
+ sampling_ratios: [1, 1, 1, 1]
61
+ decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
62
+ in_channels: 240
63
+ cfm_params: !new:omegaconf.DictConfig
64
+ content:
65
+ sigma_min: 1e-06
66
+ solver: 'euler'
67
+ t_scheduler: 'cosine'
68
+ training_cfg_rate: 0.2
69
+ inference_cfg_rate: 0.7
70
+ reg_loss_type: 'l1'
71
+ estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
72
+ in_channels: 1024
73
+ out_channels: 512
74
+ channels: [256, 256]
75
+ dropout: 0.0
76
+ attention_head_dim: 64
77
+ n_blocks: 4
78
+ num_mid_blocks: 8
79
+ num_heads: 8
80
+ act_fn: 'gelu'
81
+ generator_model_dir: !ref <generator_path>
82
+
83
+ hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
84
+ in_channels: 80
85
+ base_channels: 512
86
+ nb_harmonics: 8
87
+ sampling_rate: !ref <sample_rate>
88
+ nsf_alpha: 0.1
89
+ nsf_sigma: 0.003
90
+ nsf_voiced_threshold: 10
91
+ upsample_rates: [8, 8]
92
+ upsample_kernel_sizes: [16, 16]
93
+ istft_params:
94
+ n_fft: 16
95
+ hop_len: 4
96
+ resblock_kernel_sizes: [3, 7, 11]
97
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
98
+ source_resblock_kernel_sizes: [7, 11]
99
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
100
+ lrelu_slope: 0.1
101
+ audio_limit: 0.99
102
+ f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
103
+ num_class: 1
104
+ in_channels: 80
105
+ cond_channels: 512
106
+
107
+ wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
108
+
109
+ # processor functions
110
+ parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
111
+ get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
112
+ tokenizer_path: !ref <basemodel_path>
113
+ tokenizer_name: "qwen-2.5"
114
+ allowed_special: 'all'
115
+ tokenize: !name:inspiremusic.dataset.processor.tokenize
116
+ get_tokenizer: !ref <get_tokenizer>
117
+ allowed_special: !ref <allowed_special>
118
+ filter: !name:inspiremusic.dataset.processor.filter
119
+ max_length: 28000
120
+ min_length: 0
121
+ token_max_length: 200
122
+ token_min_length: 1
123
+ resample: !name:inspiremusic.dataset.processor.resample
124
+ resample_rate: !ref <sample_rate>
125
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
126
+ n_fft: 1024
127
+ num_mels: 128
128
+ sampling_rate: !ref <sample_rate>
129
+ hop_size: 256
130
+ win_size: 1024
131
+ fmin: 0
132
+ fmax: 24000
133
+ center: False
134
+ compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
135
+ feat_extractor: !ref <feat_extractor>
136
+ parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
137
+ normalize: True
138
+ shuffle: !name:inspiremusic.dataset.processor.shuffle
139
+ shuffle_size: 1000
140
+ sort: !name:inspiremusic.dataset.processor.sort
141
+ sort_size: 500 # sort_size should be less than shuffle_size
142
+ batch: !name:inspiremusic.dataset.processor.batch
143
+ batch_type: 'dynamic'
144
+ max_frames_in_batch: 10000 # llm 12000
145
+ padding: !name:inspiremusic.dataset.processor.padding
146
+
147
+ # dataset processor pipeline
148
+ data_pipeline: [
149
+ !ref <parquet_opener>,
150
+ !ref <tokenize>,
151
+ !ref <shuffle>,
152
+ !ref <sort>,
153
+ !ref <filter>,
154
+ !ref <batch>,
155
+ !ref <padding>,
156
+ ]
157
+
158
+
159
+ # train conf
160
+ train_conf:
161
+ optim: adam
162
+ optim_conf:
163
+ lr: 0.0001 # change to 0.001 if you want to train flow from scratch
164
+ scheduler: warmuplr
165
+ scheduler_conf:
166
+ warmup_steps: 5000
167
+ max_epoch: 200
168
+ grad_clip: 5
169
+ accum_grad: 2
170
+ log_interval: 100
171
+ save_per_step: 500
example/conf/InspireMusic-Base-24kHz.yaml ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1024]
3
+ __set_seed2: !apply:numpy.random.seed [1024]
4
+ __set_seed3: !apply:torch.manual_seed [1024]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1024]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ text_encoder_input_size: 512
10
+ llm_input_size: 896
11
+ llm_output_size: 896
12
+
13
+ basemodel_path: 'pretrained_models/InspireMusic-Base-24kHz/'
14
+ generator_path: 'pretrained_models/InspireMusic-Base-24kHz/music_tokenizer'
15
+
16
+ # model params
17
+ # For every class/function included in this repo, we use the !name: or !new: tag for initialization, so that users can find all the corresponding classes/functions from this single YAML file.
18
+ # for system/third_party class/function, we do not require this.
19
+ llm: !new:inspiremusic.llm.llm.LLM
20
+ text_encoder_input_size: !ref <text_encoder_input_size>
21
+ llm_input_size: !ref <llm_input_size>
22
+ llm_output_size: !ref <llm_output_size>
23
+ audio_token_size: 4096
24
+ length_normalized_loss: True
25
+ lsm_weight: 0
26
+ text_encoder_conf:
27
+ name: "none"
28
+ llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
29
+ input_size: !ref <text_encoder_input_size>
30
+ pretrain_path: !ref <basemodel_path>
31
+
32
+ sampling: !name:inspiremusic.utils.common.topk_sampling
33
+ top_k: 350
34
+ train_cfg_ratio: 0.2
35
+ infer_cfg_ratio: 7.0
36
+ flow: !new:inspiremusic.flow.flow.MaskedDiff
37
+ input_size: 256
38
+ output_size: 80
39
+ output_type: 'mel'
40
+ vocab_size: 4096
41
+ input_frame_rate: 75
42
+ only_mask_loss: True
43
+ encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
44
+ output_size: 512
45
+ attention_heads: 4
46
+ linear_units: 1024
47
+ num_blocks: 3
48
+ dropout_rate: 0.1
49
+ positional_dropout_rate: 0.1
50
+ attention_dropout_rate: 0.1
51
+ normalize_before: True
52
+ input_layer: 'linear'
53
+ pos_enc_layer_type: 'rel_pos_espnet'
54
+ selfattention_layer_type: 'rel_selfattn'
55
+ input_size: 256
56
+ use_cnn_module: False
57
+ macaron_style: False
58
+ length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
59
+ channels: 512
60
+ sampling_ratios: [1, 1, 1, 1]
61
+ decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
62
+ in_channels: 240
63
+ cfm_params: !new:omegaconf.DictConfig
64
+ content:
65
+ sigma_min: 1e-06
66
+ solver: 'euler'
67
+ t_scheduler: 'cosine'
68
+ training_cfg_rate: 0.2
69
+ inference_cfg_rate: 0.7
70
+ reg_loss_type: 'l1'
71
+ estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
72
+ in_channels: 1024
73
+ out_channels: 512
74
+ channels: [256, 256]
75
+ dropout: 0.0
76
+ attention_head_dim: 64
77
+ n_blocks: 4
78
+ num_mid_blocks: 8
79
+ num_heads: 8
80
+ act_fn: 'gelu'
81
+ generator_model_dir: !ref <generator_path>
82
+
83
+ hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
84
+ in_channels: 80
85
+ base_channels: 512
86
+ nb_harmonics: 8
87
+ sampling_rate: !ref <sample_rate>
88
+ nsf_alpha: 0.1
89
+ nsf_sigma: 0.003
90
+ nsf_voiced_threshold: 10
91
+ upsample_rates: [8, 8]
92
+ upsample_kernel_sizes: [16, 16]
93
+ istft_params:
94
+ n_fft: 16
95
+ hop_len: 4
96
+ resblock_kernel_sizes: [3, 7, 11]
97
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
98
+ source_resblock_kernel_sizes: [7, 11]
99
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
100
+ lrelu_slope: 0.1
101
+ audio_limit: 0.99
102
+ f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
103
+ num_class: 1
104
+ in_channels: 80
105
+ cond_channels: 512
106
+
107
+ wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
108
+
109
+ # processor functions
110
+ parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
111
+ get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
112
+ tokenizer_path: !ref <basemodel_path>
113
+ tokenizer_name: "qwen-2.0"
114
+ allowed_special: 'all'
115
+ tokenize: !name:inspiremusic.dataset.processor.tokenize
116
+ get_tokenizer: !ref <get_tokenizer>
117
+ allowed_special: !ref <allowed_special>
118
+ filter: !name:inspiremusic.dataset.processor.filter
119
+ max_length: 28000
120
+ min_length: 0
121
+ token_max_length: 200
122
+ token_min_length: 1
123
+ resample: !name:inspiremusic.dataset.processor.resample
124
+ resample_rate: !ref <sample_rate>
125
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
126
+ n_fft: 1024
127
+ num_mels: 128
128
+ sampling_rate: !ref <sample_rate>
129
+ hop_size: 256
130
+ win_size: 1024
131
+ fmin: 0
132
+ fmax: 24000
133
+ center: False
134
+ compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
135
+ feat_extractor: !ref <feat_extractor>
136
+ parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
137
+ normalize: True
138
+ shuffle: !name:inspiremusic.dataset.processor.shuffle
139
+ shuffle_size: 1000
140
+ sort: !name:inspiremusic.dataset.processor.sort
141
+ sort_size: 500 # sort_size should be less than shuffle_size
142
+ batch: !name:inspiremusic.dataset.processor.batch
143
+ batch_type: 'dynamic'
144
+ max_frames_in_batch: 10000 # llm 12000
145
+ padding: !name:inspiremusic.dataset.processor.padding
146
+
147
+ # dataset processor pipeline
148
+ data_pipeline: [
149
+ !ref <parquet_opener>,
150
+ !ref <tokenize>,
151
+ !ref <shuffle>,
152
+ !ref <sort>,
153
+ !ref <filter>,
154
+ !ref <batch>,
155
+ !ref <padding>,
156
+ ]
157
+
158
+
159
+ # train conf
160
+ train_conf:
161
+ optim: adam
162
+ optim_conf:
163
+ lr: 0.0001 # change to 0.001 if you want to train flow from scratch
164
+ scheduler: warmuplr
165
+ scheduler_conf:
166
+ warmup_steps: 5000
167
+ max_epoch: 200
168
+ grad_clip: 5
169
+ accum_grad: 2
170
+ log_interval: 100
171
+ save_per_step: 500
example/conf/InspireMusic-Base.yaml ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # set random seed, so that you may reproduce your result.
2
+ __set_seed1: !apply:random.seed [1024]
3
+ __set_seed2: !apply:numpy.random.seed [1024]
4
+ __set_seed3: !apply:torch.manual_seed [1024]
5
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1024]
6
+
7
+ # fixed params
8
+ sample_rate: 24000
9
+ target_sample_rate: 48000
10
+ text_encoder_input_size: 512
11
+ llm_input_size: 896
12
+ llm_output_size: 896
13
+
14
+ basemodel_path: 'pretrained_models/InspireMusic-Base/'
15
+ generator_path: 'pretrained_models/InspireMusic-Base/music_tokenizer'
16
+
17
+ # model params
18
+ # For every class/function included in this repo, we use the !name: or !new: tag for initialization, so that users can find all the corresponding classes/functions from this single YAML file.
19
+ # for system/third_party class/function, we do not require this.
20
+ llm: !new:inspiremusic.llm.llm.LLM
21
+ text_encoder_input_size: !ref <text_encoder_input_size>
22
+ llm_input_size: !ref <llm_input_size>
23
+ llm_output_size: !ref <llm_output_size>
24
+ audio_token_size: 4096
25
+ length_normalized_loss: True
26
+ lsm_weight: 0
27
+ text_encoder_conf:
28
+ name: "none"
29
+ llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
30
+ input_size: !ref <text_encoder_input_size>
31
+ pretrain_path: !ref <basemodel_path>
32
+
33
+ sampling: !name:inspiremusic.utils.common.topk_sampling
34
+ top_k: 350
35
+ train_cfg_ratio: 0.2
36
+ infer_cfg_ratio: 3.0
37
+ flow: !new:inspiremusic.flow.flow.MaskedDiff
38
+ input_size: 256
39
+ output_size: 80
40
+ output_type: 'mel'
41
+ vocab_size: 4096
42
+ input_frame_rate: 75
43
+ only_mask_loss: True
44
+ encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
45
+ output_size: 512
46
+ attention_heads: 4
47
+ linear_units: 1024
48
+ num_blocks: 3
49
+ dropout_rate: 0.1
50
+ positional_dropout_rate: 0.1
51
+ attention_dropout_rate: 0.1
52
+ normalize_before: True
53
+ input_layer: 'linear'
54
+ pos_enc_layer_type: 'rel_pos_espnet'
55
+ selfattention_layer_type: 'rel_selfattn'
56
+ input_size: 256
57
+ use_cnn_module: False
58
+ macaron_style: False
59
+ length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
60
+ channels: 512
61
+ sampling_ratios: [1, 1, 1, 1]
62
+ decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
63
+ in_channels: 240
64
+ cfm_params: !new:omegaconf.DictConfig
65
+ content:
66
+ sigma_min: 1e-06
67
+ solver: 'euler'
68
+ t_scheduler: 'cosine'
69
+ training_cfg_rate: 0.2
70
+ inference_cfg_rate: 0.7
71
+ reg_loss_type: 'l1'
72
+ estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
73
+ in_channels: 1024
74
+ out_channels: 512
75
+ channels: [256, 256]
76
+ dropout: 0.0
77
+ attention_head_dim: 64
78
+ n_blocks: 4
79
+ num_mid_blocks: 8
80
+ num_heads: 8
81
+ act_fn: 'gelu'
82
+ generator_model_dir: !ref <generator_path>
83
+
84
+ hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
85
+ in_channels: 80
86
+ base_channels: 512
87
+ nb_harmonics: 8
88
+ sampling_rate: !ref <sample_rate>
89
+ nsf_alpha: 0.1
90
+ nsf_sigma: 0.003
91
+ nsf_voiced_threshold: 10
92
+ upsample_rates: [8, 8]
93
+ upsample_kernel_sizes: [16, 16]
94
+ istft_params:
95
+ n_fft: 16
96
+ hop_len: 4
97
+ resblock_kernel_sizes: [3, 7, 11]
98
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
99
+ source_resblock_kernel_sizes: [7, 11]
100
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
101
+ lrelu_slope: 0.1
102
+ audio_limit: 0.99
103
+ f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
104
+ num_class: 1
105
+ in_channels: 80
106
+ cond_channels: 512
107
+
108
+ wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
109
+
110
+ # processor functions
111
+ parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
112
+ get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
113
+ tokenizer_path: !ref <basemodel_path>
114
+ tokenizer_name: "qwen-2.0"
115
+ allowed_special: 'all'
116
+ tokenize: !name:inspiremusic.dataset.processor.tokenize
117
+ get_tokenizer: !ref <get_tokenizer>
118
+ allowed_special: !ref <allowed_special>
119
+ filter: !name:inspiremusic.dataset.processor.filter
120
+ max_length: 20000
121
+ min_length: 1
122
+ token_max_length: 200
123
+ token_min_length: 1
124
+ max_acoustic_length: 20000
125
+ min_acoustic_length: 1800
126
+ mode: 'train_flow'
127
+
128
+ resample: !name:inspiremusic.dataset.processor.resample
129
+ resample_rate: !ref <sample_rate>
130
+
131
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
132
+ n_fft: 1024
133
+ num_mels: 128
134
+ sampling_rate: !ref <sample_rate>
135
+ hop_size: 256
136
+ win_size: 1024
137
+ fmin: 0
138
+ fmax: 24000
139
+ center: False
140
+ compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
141
+ feat_extractor: !ref <feat_extractor>
142
+ parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
143
+ normalize: True
144
+ shuffle: !name:inspiremusic.dataset.processor.shuffle
145
+ shuffle_size: 1000
146
+ sort: !name:inspiremusic.dataset.processor.sort
147
+ sort_size: 500 # sort_size should be less than shuffle_size
148
+ batch: !name:inspiremusic.dataset.processor.batch
149
+ batch_type: 'dynamic'
150
+ max_frames_in_batch: 15500 # llm 12000
151
+ # batch_type: 'static'
152
+ # batch_size: 2 # llm 12000
153
+ padding: !name:inspiremusic.dataset.processor.padding
154
+ mode: 'train'
155
+
156
+ # dataset processor pipeline
157
+ data_pipeline: [
158
+ !ref <parquet_opener>,
159
+ !ref <tokenize>,
160
+ !ref <shuffle>,
161
+ !ref <sort>,
162
+ !ref <filter>,
163
+ !ref <batch>,
164
+ !ref <padding>,
165
+ ]
166
+
167
+
168
+ # train conf
169
+ train_conf:
170
+ optim: adam
171
+ optim_conf:
172
+ lr: 0.0001 # change to 0.001 if you want to train flow from scratch
173
+ scheduler: warmuplr
174
+ scheduler_conf:
175
+ warmup_steps: 500
176
+ max_epoch: 200
177
+ grad_clip: 5
178
+ accum_grad: 2
179
+ log_interval: 100
180
+ save_per_step: 500