Spaces:

FunAudioLLM
/

InspireMusic

Runtime error

App Files Files Community

chong.zhang committed on Apr 9

Commit

5827423

1 Parent(s): 806cba5

init

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +39 -0
.gitmodules +3 -0
README.md +5 -6
app.py +237 -4
example/conf/InspireMusic-1.5B-24kHz.yaml +171 -0
example/conf/InspireMusic-1.5B-Long.yaml +171 -0
example/conf/InspireMusic-1.5B.yaml +171 -0
example/conf/InspireMusic-Base-24kHz.yaml +171 -0
example/conf/InspireMusic-Base.yaml +180 -0
inspiremusic/.DS_Store +0 -0
inspiremusic/__init__.py +0 -0
inspiremusic/bin/export_jit.py +74 -0
inspiremusic/bin/export_onnx.py +112 -0
inspiremusic/bin/flow_only_infer.py +150 -0
inspiremusic/bin/inference.py +266 -0
inspiremusic/bin/train.py +194 -0
inspiremusic/cli/__init__.py +0 -0
inspiremusic/cli/frontend.py +100 -0
inspiremusic/cli/inference.py +312 -0
inspiremusic/cli/inspiremusic.py +143 -0
inspiremusic/cli/model.py +295 -0
inspiremusic/dataset/__init__.py +0 -0
inspiremusic/dataset/dataset.py +154 -0
inspiremusic/dataset/processor.py +595 -0
inspiremusic/flow/decoder.py +277 -0
inspiremusic/flow/flow.py +143 -0
inspiremusic/flow/flow_matching.py +167 -0
inspiremusic/flow/length_regulator.py +69 -0
inspiremusic/hifigan/discriminator.py +140 -0
inspiremusic/hifigan/f0_predictor.py +55 -0
inspiremusic/hifigan/generator.py +411 -0
inspiremusic/hifigan/hifigan.py +66 -0
inspiremusic/llm/llm.py +409 -0
inspiremusic/metrics/clap_score.py +135 -0
inspiremusic/metrics/openl3_fd.py +338 -0
inspiremusic/metrics/passt_kld.py +232 -0
inspiremusic/music_tokenizer/__init__.py +0 -0
inspiremusic/music_tokenizer/env.py +29 -0
inspiremusic/music_tokenizer/meldataset.py +226 -0
inspiremusic/music_tokenizer/models.py +548 -0
inspiremusic/music_tokenizer/vqvae.py +58 -0
inspiremusic/text/abs_tokenizer.py +34 -0
inspiremusic/text/tokenizer.py +76 -0
inspiremusic/transformer/__init__.py +0 -0
inspiremusic/transformer/activation.py +84 -0
inspiremusic/transformer/attention.py +328 -0
inspiremusic/transformer/convolution.py +145 -0
inspiremusic/transformer/decoder.py +396 -0
inspiremusic/transformer/decoder_layer.py +132 -0
inspiremusic/transformer/embedding.py +294 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,42 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_01.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_noflow_01.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_w_cfm_chorus.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_w_cfm_intro.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_w_cfm_verse_ras.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_w_cfm_verse_topk.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_w_cfm_verse.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_wo_cfm_chorus.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_wo_cfm_intro.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_wo_cfm_verse_topk.wav filter=lfs diff=lfs merge=lfs -text
+example/inspiremusic/inspiremusic_wo_cfm_verse.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_01.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_02.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_03.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_04.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_05.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_06.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_07.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_08.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_09.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/chorus/chorus_10.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/intro/intro_01.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/intro/intro_02.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/intro/intro_03.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/intro/intro_04.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/intro/intro_05.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/intro/intro_06.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/outro/outro_01.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/outro/outro_02.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/outro/outro_03.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/outro/outro_04.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_01.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_02.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_03.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_04.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_05.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_06.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_07.wav filter=lfs diff=lfs merge=lfs -text
+example/ras/verse/verse_08.wav filter=lfs diff=lfs merge=lfs -text

.gitmodules ADDED Viewed

	@@ -0,0 +1,3 @@

+[submodule "third_party/Matcha-TTS"]
+	path = third_party/Matcha-TTS
+	url = https://github.com/shivammehta25/Matcha-TTS.git

README.md CHANGED Viewed

@@ -1,14 +1,13 @@
 ---
 title: InspireMusic
-emoji: 🏃
-colorFrom: blue
-colorTo: blue
 sdk: gradio
-sdk_version: 5.23.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: InspireMusic
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: InspireMusic
+emoji: 🎶
+colorFrom: indigo
+colorTo: purple
 sdk: gradio
 app_file: app.py
 pinned: false
 license: apache-2.0
+short_description: Music Generation - text to music, music continuation.
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,7 +1,240 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+# Copyright (c) 2024 Alibaba Inc (authors: Chong Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+# Print the GPU status reported by nvidia-smi at startup; the exit status is ignored.
+os.system('nvidia-smi')
+# Install apt-utils and unzip through apt at startup; the exit status is ignored.
+os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
+# NOTE(review): PYTHONPATH set here is inherited by child processes only; it does not
+# change this interpreter's sys.path (that is done with sys.path.append further down).
+os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
+# Clone the five model repositories into pretrained_models/ and strip every "../../"
+# from each model's inspiremusic.yaml. The commands are chained with &&, so the first
+# failing step (including mkdir on an already existing directory) skips the rest.
+os.system('mkdir pretrained_models && cd pretrained_models && git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')
+import sys
+import torch
+# Print the cuDNN version reported by torch as a startup diagnostic.
+print(torch.backends.cudnn.version())
+# Absolute path of the directory that contains this file.
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+# Add the bundled Matcha-TTS checkout to the module search path.
+sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
+import spaces
 import gradio as gr
+from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
+import torchaudio
+import datetime
+import hashlib
+import threading
+import time
+import importlib
+# Model names offered in the UI dropdown; each is also used as a sub-directory
+# name under pretrained_models/ when building the model path.
+MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base", "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
+# Directory where trimmed audio prompts are written.
+AUDIO_PROMPT_DIR = "demo/audio_prompts"
+# Directory passed to the model as its result directory.
+OUTPUT_AUDIO_DIR = "demo/outputs"
+# Example text prompts shown below the form.
+DEMO_TEXT_PROMPTS = ["Jazz music with drum beats.",
+					 "A captivating classical piano performance, this piece exudes a dynamic and intense atmosphere, showcasing intricate and expressive instrumental artistry.",
+					 "A soothing instrumental piece blending elements of light music and pop, featuring a gentle guitar rendition. The overall feel is serene and reflective, likely instrumental with no vocals.",
+					 "The instrumental rock piece features dynamic oscillations and wave-like progressions, creating an immersive and energetic atmosphere. The music is purely instrumental, with no vocals, and it blends elements of rock and post-rock for a powerful and evocative experience.",
+					 "The classical instrumental piece exudes a haunting and evocative atmosphere, characterized by its intricate guitar work and profound emotional depth.",
+					 "Experience a dynamic blend of instrumental electronic music with futuristic house vibes, featuring energetic beats and a captivating rhythm. The tracks are likely instrumental, focusing on the immersive soundscapes rather than vocal performances."]
+# Shared flag to control the process: set by cancel_process(), polled by process().
+stop_flag = threading.Event()
+def cancel_process():
+    """
+    Sets the stop_flag to stop the long-running process.
+    """
+    stop_flag.set()
+    return "Cancellation requested. Please wait for the process to stop."
+def generate_filename():
+	hash_object = hashlib.sha256(str(int(datetime.datetime.now().timestamp())).encode())
+	hash_string = hash_object.hexdigest()
+	return hash_string
+def get_args(
+		task, text="", audio=None, model_name="InspireMusic-Base",
+		chorus="intro",
+		output_sample_rate=48000, max_generate_audio_seconds=30.0, time_start = 0.0, time_end=30.0, trim=False):
+	if "24kHz" in model_name:
+		output_sample_rate = 24000
+	if output_sample_rate == 24000:
+		fast = True
+	else:
+		fast = False
+	# This function constructs the arguments required for InspireMusic
+	args = {
+		"task"                      : task,
+		"text"                      : text,
+		"audio_prompt"              : audio,
+		"model_name"                : model_name,
+		"chorus"                    : chorus,
+		"fast"                      : fast,
+		"fade_out"                  : True,
+		"trim"                      : trim,
+		"output_sample_rate"        : output_sample_rate,
+		"min_generate_audio_seconds": 10.0,
+		"max_generate_audio_seconds": max_generate_audio_seconds,
+		"max_audio_prompt_length": 5.0,
+		"model_dir"                 : os.path.join("pretrained_models",
+												   model_name),
+		"result_dir"                : OUTPUT_AUDIO_DIR,
+		"output_fn"                 : generate_filename(),
+		"format"                    : "wav",
+		"time_start" : time_start,
+		"time_end": time_end,
+		"fade_out_duration": 1.0,
+	}
+	if args["time_start"] is None:
+		args["time_start"] = 0.0
+	args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"]
+	print(args)
+	return args
+def trim_audio(audio_file, cut_seconds=5):
+	audio, sr = torchaudio.load(audio_file)
+	num_samples = cut_seconds * sr
+	cutted_audio = audio[:, :num_samples]
+	output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
+	torchaudio.save(output_path, cutted_audio, sr)
+	return output_path
+@spaces.GPU(duration=120)
+def music_generation(args):
+	set_env_variables()
+	model = InspireMusicUnified(
+			model_name=args["model_name"],
+			model_dir=args["model_dir"],
+			min_generate_audio_seconds=args["min_generate_audio_seconds"],
+			max_generate_audio_seconds=args["max_generate_audio_seconds"],
+			sample_rate=24000,
+			output_sample_rate=args["output_sample_rate"],
+			load_jit=True,
+			load_onnx=False,
+			fast=args["fast"],
+			result_dir=args["result_dir"])
+	output_path = model.inference(
+			task=args["task"],
+			text=args["text"],
+			audio_prompt=args["audio_prompt"],
+			chorus=args["chorus"],
+			time_start=args["time_start"],
+			time_end=args["time_end"],
+			output_fn=args["output_fn"],
+			max_audio_prompt_length=args["max_audio_prompt_length"],
+			fade_out_duration=args["fade_out_duration"],
+			output_format=args["format"],
+			fade_out_mode=args["fade_out"],
+			trim=args["trim"])
+	return output_path
+def demo_inspiremusic_t2m(text, model_name, chorus,
+					 output_sample_rate, max_generate_audio_seconds):
+	args = get_args(
+			task='text-to-music', text=text, audio=None,
+			model_name=model_name, chorus=chorus,
+			output_sample_rate=output_sample_rate,
+			max_generate_audio_seconds=max_generate_audio_seconds)
+	return music_generation(args)
+def demo_inspiremusic_con(text, audio, model_name, chorus,
+					 output_sample_rate, max_generate_audio_seconds):
+	args = get_args(
+			task='continuation', text=text, audio=trim_audio(audio, cut_seconds=5),
+			model_name=model_name, chorus=chorus,
+			output_sample_rate=output_sample_rate,
+			max_generate_audio_seconds=max_generate_audio_seconds)
+	return music_generation(args)
+def process(args, progress=gr.Progress()):
+	progress(0, desc="Starting process...")
+	idx = 1
+	for i in range(idx):
+		if stop_flag.is_set():
+			progress(i / idx, desc="Process canceled.")
+			break
+		music_generation(args)
+		time.sleep(1)
+		progress((i + 1) / idx, desc=f"Processing step {i + 1}/{idx}")
+	return "Process completed successfully."
+def main():
+	with gr.Blocks(theme=gr.themes.Soft()) as demo:
+		gr.Markdown("""
+		# InspireMusic
+		- Support music generation tasks with long-form and high audio quality, sampling rates up to 48kHz.
+		- Github: https://github.com/FunAudioLLM/InspireMusic/  |  ModelScope Studio: https://modelscope.cn/studios/iic/InspireMusic
+		- Available music generation models: [InspireMusic-1.5B-Long](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long), [InspireMusic-1.5B](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B), [InspireMusic-Base](https://huggingface.co/FunAudioLLM/InspireMusic-Base), [InspireMusic-1.5B-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz), [InspireMusic-Base-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz). Both on Huggingface and ModelScope.
+		- Currently only support English text prompts.
+		- This page is for demo purpose, if you want to generate long-form audio, e.g., 5mins, please try to deploy locally. Thank you for your support.
+		""")
+		with gr.Row(equal_height=True):
+			model_name = gr.Dropdown(
+					MODELS, label="Select Model Name",
+					value="InspireMusic-1.5B-Long")
+			chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"],
+								 label="Chorus Mode", value="intro")
+			output_sample_rate = gr.Dropdown([48000, 24000],
+											 label="Output Audio Sample Rate (Hz)",
+											 value=48000)
+			max_generate_audio_seconds = gr.Slider(10, 300,
+												   label="Generate Audio Length (s)",
+												   value=30)
+		with gr.Row(equal_height=True):
+			text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)",
+									value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
+			audio_input = gr.Audio(
+				label="Input Audio Prompt (For Music Continuation Task)",
+				type="filepath")
+		music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button = True)
+		with gr.Row():
+			button = gr.Button("Submit Text-to-Music Task")
+			button.click(demo_inspiremusic_t2m,
+						 inputs=[text_input, model_name,
+								 chorus,
+								 output_sample_rate,
+								 max_generate_audio_seconds],
+						 outputs=music_output)
+			generate_button = gr.Button("Submit Music Continuation Task")
+			generate_button.click(demo_inspiremusic_con,
+								  inputs=[text_input, audio_input, model_name,
+										  chorus,
+										  output_sample_rate,
+										  max_generate_audio_seconds],
+								  outputs=music_output)
+			cancel_button = gr.Button("Cancel")
+		cancel_button.click(
+				fn=cancel_process,
+				inputs=[],
+				outputs="Cancel process."
+		)
+		t2m_examples = gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
+	demo.launch()
+if __name__ == '__main__':
+	os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
+	os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
+	main()

example/conf/InspireMusic-1.5B-24kHz.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+basemodel_path: 'pretrained_models/InspireMusic-1.5B-24kHz/'
+generator_path: 'pretrained_models/InspireMusic-1.5B-24kHz/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-1.5B-Long.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1988]
+__set_seed2: !apply:numpy.random.seed [1988]
+__set_seed3: !apply:torch.manual_seed [1988]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1988]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+basemodel_path: 'pretrained_models/InspireMusic-1.5B-Long/'
+generator_path: 'pretrained_models/InspireMusic-1.5B-Long/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-1.5B.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1988]
+__set_seed2: !apply:numpy.random.seed [1988]
+__set_seed3: !apply:torch.manual_seed [1988]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1988]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 1536
+llm_output_size: 1536
+basemodel_path: 'pretrained_models/InspireMusic-1.5B/'
+generator_path: 'pretrained_models/InspireMusic-1.5B/music_tokenizer'
+# model params
+# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.5"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-Base-24kHz.yaml ADDED Viewed

	@@ -0,0 +1,171 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+sample_rate: 24000
+text_encoder_input_size: 512
+llm_input_size: 896
+llm_output_size: 896
+basemodel_path: 'pretrained_models/InspireMusic-Base-24kHz/'
+generator_path: 'pretrained_models/InspireMusic-Base-24kHz/music_tokenizer'
+# model params
+# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find all corresponding classes/functions from one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 7.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.0"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 28000
+    min_length: 0
+    token_max_length: 200
+    token_min_length: 1
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 10000 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 5000
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

example/conf/InspireMusic-Base.yaml ADDED Viewed

	@@ -0,0 +1,180 @@

+# set random seed, so that you may reproduce your result.
+__set_seed1: !apply:random.seed [1024]
+__set_seed2: !apply:numpy.random.seed [1024]
+__set_seed3: !apply:torch.manual_seed [1024]
+__set_seed4: !apply:torch.cuda.manual_seed_all [1024]
+# fixed params
+sample_rate: 24000
+target_sample_rate: 48000
+text_encoder_input_size: 512
+llm_input_size: 896
+llm_output_size: 896
+basemodel_path: 'pretrained_models/InspireMusic-Base/'
+generator_path: 'pretrained_models/InspireMusic-Base/music_tokenizer'
+# model params
+# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find all corresponding classes/functions from one single yaml.
+# for system/third_party class/function, we do not require this.
+llm: !new:inspiremusic.llm.llm.LLM
+    text_encoder_input_size: !ref <text_encoder_input_size>
+    llm_input_size: !ref <llm_input_size>
+    llm_output_size: !ref <llm_output_size>
+    audio_token_size: 4096
+    length_normalized_loss: True
+    lsm_weight: 0
+    text_encoder_conf:
+        name: "none"
+    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
+        input_size: !ref <text_encoder_input_size>
+        pretrain_path: !ref <basemodel_path>
+    sampling: !name:inspiremusic.utils.common.topk_sampling
+        top_k: 350
+    train_cfg_ratio: 0.2
+    infer_cfg_ratio: 3.0
+flow: !new:inspiremusic.flow.flow.MaskedDiff
+    input_size: 256
+    output_size: 80
+    output_type: 'mel'
+    vocab_size: 4096
+    input_frame_rate: 75
+    only_mask_loss: True
+    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
+        output_size: 512
+        attention_heads: 4
+        linear_units: 1024
+        num_blocks: 3
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.1
+        normalize_before: True
+        input_layer: 'linear'
+        pos_enc_layer_type: 'rel_pos_espnet'
+        selfattention_layer_type: 'rel_selfattn'
+        input_size: 256
+        use_cnn_module: False
+        macaron_style: False
+    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
+        channels: 512
+        sampling_ratios: [1, 1, 1, 1]
+    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
+        in_channels: 240
+        cfm_params: !new:omegaconf.DictConfig
+            content:
+                sigma_min: 1e-06
+                solver: 'euler'
+                t_scheduler: 'cosine'
+                training_cfg_rate: 0.2
+                inference_cfg_rate: 0.7
+                reg_loss_type: 'l1'
+        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
+            in_channels: 1024
+            out_channels: 512
+            channels: [256, 256]
+            dropout: 0.0
+            attention_head_dim: 64
+            n_blocks: 4
+            num_mid_blocks: 8
+            num_heads: 8
+            act_fn: 'gelu'
+    generator_model_dir: !ref <generator_path>
+hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
+    in_channels: 80
+    base_channels: 512
+    nb_harmonics: 8
+    sampling_rate: !ref <sample_rate>
+    nsf_alpha: 0.1
+    nsf_sigma: 0.003
+    nsf_voiced_threshold: 10
+    upsample_rates: [8, 8]
+    upsample_kernel_sizes: [16, 16]
+    istft_params:
+        n_fft: 16
+        hop_len: 4
+    resblock_kernel_sizes: [3, 7, 11]
+    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+    source_resblock_kernel_sizes: [7, 11]
+    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
+    lrelu_slope: 0.1
+    audio_limit: 0.99
+    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
+        num_class: 1
+        in_channels: 80
+        cond_channels: 512
+wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
+# processor functions
+parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
+get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
+    tokenizer_path: !ref <basemodel_path>
+    tokenizer_name: "qwen-2.0"
+allowed_special: 'all'
+tokenize: !name:inspiremusic.dataset.processor.tokenize
+    get_tokenizer: !ref <get_tokenizer>
+    allowed_special: !ref <allowed_special>
+filter: !name:inspiremusic.dataset.processor.filter
+    max_length: 20000
+    min_length: 1
+    token_max_length: 200
+    token_min_length: 1
+    max_acoustic_length: 20000
+    min_acoustic_length: 1800
+    mode: 'train_flow'
+resample: !name:inspiremusic.dataset.processor.resample
+    resample_rate: !ref <sample_rate>
+feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+    n_fft: 1024
+    num_mels: 128
+    sampling_rate: !ref <sample_rate>
+    hop_size: 256
+    win_size: 1024
+    fmin: 0
+    fmax: 24000
+    center: False
+compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
+    feat_extractor: !ref <feat_extractor>
+parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
+    normalize: True
+shuffle: !name:inspiremusic.dataset.processor.shuffle
+    shuffle_size: 1000
+sort: !name:inspiremusic.dataset.processor.sort
+    sort_size: 500  # sort_size should be less than shuffle_size
+batch: !name:inspiremusic.dataset.processor.batch
+    batch_type: 'dynamic'
+    max_frames_in_batch: 15500 # llm 12000
+    # batch_type: 'static'
+    # batch_size: 2 # llm 12000
+padding: !name:inspiremusic.dataset.processor.padding
+    mode: 'train'
+# dataset processor pipeline
+data_pipeline: [
+    !ref <parquet_opener>,
+    !ref <tokenize>,
+    !ref <shuffle>,
+    !ref <sort>,
+    !ref <filter>,
+    !ref <batch>,
+    !ref <padding>,
+]
+# train conf
+train_conf:
+    optim: adam
+    optim_conf:
+        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
+    scheduler: warmuplr
+    scheduler_conf:
+        warmup_steps: 500
+    max_epoch: 200
+    grad_clip: 5
+    accum_grad: 2
+    log_interval: 100
+    save_per_step: 500

inspiremusic/.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

inspiremusic/__init__.py ADDED Viewed

File without changes

inspiremusic/bin/export_jit.py ADDED Viewed

	@@ -0,0 +1,74 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+import torch
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from inspiremusic.cli.inspiremusic import InspireMusic
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/InspireMusic',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+def main():
+    """Export three sub-modules of an InspireMusic model as TorchScript archives.
+    The LLM text encoder and the LLM are converted to half precision and the
+    flow encoder is left as loaded; each one is scripted, frozen, optimized
+    for inference and saved under ``args.model_dir``.
+    """
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    # NOTE(review): these are private torch._C JIT switches (static fusion,
+    # profiling mode/executor off); confirm they still exist in the torch
+    # version this script is run with.
+    torch._C._jit_set_fusion_strategy([('STATIC', 1)])
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_set_profiling_executor(False)
+    # Load the plain PyTorch model; no JIT/ONNX artifacts are loaded.
+    inspiremusic = InspireMusic(args.model_dir, load_jit=False, load_onnx=False)
+    # 1. export llm text_encoder (half precision -> *.fp16.zip)
+    llm_text_encoder = inspiremusic.model.llm.text_encoder.half()
+    script = torch.jit.script(llm_text_encoder)
+    script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
+    # 2. export llm llm (half precision); 'forward_chunk' is listed in
+    # preserved_attrs so that freezing keeps it on the scripted module.
+    llm_llm = inspiremusic.model.llm.llm.half()
+    script = torch.jit.script(llm_llm)
+    script = torch.jit.freeze(script, preserved_attrs=['forward_chunk'])
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
+    # 3. export flow encoder (no .half() call here -> saved as *.fp32.zip)
+    flow_encoder = inspiremusic.model.flow.encoder
+    script = torch.jit.script(flow_encoder)
+    script = torch.jit.freeze(script)
+    script = torch.jit.optimize_for_inference(script)
+    script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
+if __name__ == '__main__':
+    main()

inspiremusic/bin/export_onnx.py ADDED Viewed

	@@ -0,0 +1,112 @@

+# Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, [email protected])
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import sys
+import onnxruntime
+import random
+import torch
+from tqdm import tqdm
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append('{}/../..'.format(ROOT_DIR))
+sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+from inspiremusic.cli.inspiremusic import InspireMusic
+def get_dummy_input(batch_size, seq_len, out_channels, device):
+    x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
+    mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    t = torch.rand((batch_size), dtype=torch.float32, device=device)
+    spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
+    cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+    return x, mask, mu, t, spks, cond
+def get_args():
+    parser = argparse.ArgumentParser(description='export your model for deployment')
+    parser.add_argument('--model_dir',
+                        type=str,
+                        default='pretrained_models/InspireMusic',
+                        help='local path')
+    args = parser.parse_args()
+    print(args)
+    return args
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    inspiremusic = InspireMusic(args.model_dir, load_jit=False, load_onnx=False)
+    # 1. export flow decoder estimator
+    estimator = inspiremusic.model.flow.decoder.estimator
+    device = inspiremusic.model.device
+    batch_size, seq_len = 1, 256
+    out_channels = inspiremusic.model.flow.decoder.estimator.out_channels
+    x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+    torch.onnx.export(
+        estimator,
+        (x, mask, mu, t, spks, cond),
+        '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+        export_params=True,
+        opset_version=18,
+        do_constant_folding=True,
+        input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+        output_names=['estimator_out'],
+        dynamic_axes={
+            'x': {0: 'batch_size', 2: 'seq_len'},
+            'mask': {0: 'batch_size', 2: 'seq_len'},
+            'mu': {0: 'batch_size', 2: 'seq_len'},
+            'cond': {0: 'batch_size', 2: 'seq_len'},
+            't': {0: 'batch_size'},
+            'spks': {0: 'batch_size'},
+            'estimator_out': {0: 'batch_size', 2: 'seq_len'},
+        }
+    )
+    # 2. test computation consistency
+    option = onnxruntime.SessionOptions()
+    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+    option.intra_op_num_threads = 1
+    providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+    estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
+                                                  sess_options=option, providers=providers)
+    for _ in tqdm(range(10)):
+        x, mask, mu, t, spks, cond = get_dummy_input(random.randint(1, 6), random.randint(16, 512), out_channels, device)
+        output_pytorch = estimator(x, mask, mu, t, spks, cond)
+        ort_inputs = {
+            'x': x.cpu().numpy(),
+            'mask': mask.cpu().numpy(),
+            'mu': mu.cpu().numpy(),
+            't': t.cpu().numpy(),
+            'spks': spks.cpu().numpy(),
+            'cond': cond.cpu().numpy()
+        }
+        output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+        torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+if __name__ == "__main__":
+    main()

inspiremusic/bin/flow_only_infer.py ADDED Viewed

	@@ -0,0 +1,150 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import torch
+from torch.utils.data import DataLoader
+import torchaudio
+from hyperpyyaml import load_hyperpyyaml
+from tqdm import tqdm
+from inspiremusic.cli.model import InspireMusicModel
+from inspiremusic.dataset.dataset import Dataset
+from inspiremusic.utils.common import MUSIC_STRUCTURE_LABELS
+def get_args():
+    parser = argparse.ArgumentParser(description='inference only with flow model')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--prompt_data', required=True, help='prompt data file')
+    parser.add_argument('--flow_model', required=True, help='flow model file')
+    parser.add_argument('--llm_model', default=None,required=False, help='llm model file')
+    parser.add_argument('--music_tokenizer', required=True, help='music tokenizer model file')
+    parser.add_argument('--wavtokenizer', required=True, help='wavtokenizer model file')
+    parser.add_argument('--chorus', default="random",required=False, help='chorus tag generation mode, eg. random, verse, chorus, intro.')
+    parser.add_argument('--sample_rate', type=int, default=48000, required=False,
+                        help='sampling rate of generated audio')
+    parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0, required=False,
+                        help='the minimum generated audio length in seconds')
+    parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0, required=False,
+                        help='the maximum generated audio length in seconds')
+    parser.add_argument('--gpu',
+                        type=int,
+                        default=-1,
+                        help='gpu id for this rank, -1 for cpu')
+    parser.add_argument('--result_dir', required=True, help='asr result file')
+    args = parser.parse_args()
+    print(args)
+    return args
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+    # Init inspiremusic models from configs
+    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
+    device = torch.device('cuda' if use_cuda else 'cpu')
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f)
+    model = InspireMusicModel(None, configs['flow'], configs['hift'], configs['wavtokenizer'])
+    model.load(args.llm_model, args.flow_model, args.music_tokenizer, args.wavtokenizer)
+    if args.llm_model is None:
+        model.llm = None
+    else:
+        model.llm = model.llm.to(torch.float32)
+    if args.flow_model is None:
+        model.flow = None
+    test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=True, partition=False)
+    test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+    del configs
+    os.makedirs(args.result_dir, exist_ok=True)
+    fn = os.path.join(args.result_dir, 'wav.scp')
+    f = open(fn, 'w')
+    with torch.no_grad():
+        for _, batch in tqdm(enumerate(test_data_loader)):
+            utts = batch["utts"]
+            assert len(utts) == 1, "inference mode only support batchsize 1"
+            if "semantic_token" in batch:
+                token  = batch["semantic_token"].to(device)
+                token_len  = batch["semantic_token_len"].to(device)
+            else:
+                if audio_token is None:
+                    token = None
+                    token_len = None
+                else:
+                    token = audio_token.view(audio_token.size(0),-1,4)[:,:,0]
+                    token_len  = audio_token_len / 4
+            text_token = batch["text_token"].to(device)
+            text_token_len = batch["text_token_len"].to(device)
+            text = batch["text"]
+            if "time_start" not in batch.keys():
+                batch["time_start"] = torch.randint(0, args.min_generate_audio_seconds, (1,)).to(torch.float64)
+            if "time_end" not in batch.keys():
+                batch["time_end"] = torch.randint(args.min_generate_audio_seconds, args.max_generate_audio_seconds, (1,)).to(torch.float64)
+            elif (batch["time_end"].numpy()[0] - batch["time_start"].numpy()[0]) < args.min_generate_audio_seconds:
+                batch["time_end"] = torch.randint(int(batch["time_start"].numpy()[0] + args.min_generate_audio_seconds), int(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds), (1,)).to(torch.float64)
+            if "chorus" not in batch.keys():
+                batch["chorus"] = torch.randint(1, 5, (1,))
+            if args.chorus == "random":
+                batch["chorus"] = torch.randint(1, 5, (1,))
+            elif args.chorus == "intro":
+                batch["chorus"] = torch.Tensor([0])
+            elif "verse" in args.chorus:
+                batch["chorus"] = torch.Tensor([1])
+            elif args.chorus == "chorus":
+                batch["chorus"] = torch.Tensor([2])
+            elif args.chorus == "outro":
+                batch["chorus"] = torch.Tensor([4])
+            time_start = batch["time_start"].to(device)
+            time_end = batch["time_end"].to(device)
+            chorus = batch["chorus"].to(torch.int)
+            text_prompt = f"<|{batch['time_start'].numpy()[0]}|><|{MUSIC_STRUCTURE_LABELS[chorus.numpy()[0]]}|><|{batch['text'][0]}|><|{batch['time_end'].numpy()[0]}|>"
+            chorus = chorus.to(device)
+            model_input = {"text": text, "audio_token": token, "audio_token_len": token_len,
+                                "text_token": text_token, "text_token_len": text_token_len,
+                                "embeddings": [time_start, time_end, chorus], "raw_text":text}
+            music_audios = []
+            for model_output in model.inference(**model_input):
+                music_audios.append(model_output['music_audio'])
+            music_key = utts[0]
+            music_fn = os.path.join(args.result_dir, '{}.wav'.format(music_key))
+            torchaudio.save(music_fn, music_audios[0], sample_rate=args.sample_rate)
+            f.write('{} {}\n'.format(music_key, music_fn))
+            f.flush()
+    f.close()
+    logging.info('Result wav.scp saved in {}'.format(fn))
+if __name__ == '__main__':
+    main()

inspiremusic/bin/inference.py ADDED Viewed

	@@ -0,0 +1,266 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+import os
+import torch
+from torch.utils.data import DataLoader
+import torchaudio
+from hyperpyyaml import load_hyperpyyaml
+from tqdm import tqdm
+from inspiremusic.cli.model import InspireMusicModel
+from inspiremusic.dataset.dataset import Dataset
+import time
+from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
+from inspiremusic.utils.common import MUSIC_STRUCTURE_LABELS
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def get_args():
+    parser = argparse.ArgumentParser(description='inference only with your model')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--prompt_data', required=True, help='prompt data file')
+    parser.add_argument('--flow_model', default=None, required=False, help='flow model file')
+    parser.add_argument('--llm_model', default=None,required=False, help='flow model file')
+    parser.add_argument('--music_tokenizer', required=True, help='music tokenizer model file')
+    parser.add_argument('--wavtokenizer', required=True, help='wavtokenizer model file')
+    parser.add_argument('--chorus', default="random",required=False, help='chorus tag generation mode, eg. random, verse, chorus, intro.')
+    parser.add_argument('--fast', action='store_true', required=False, help='True: fast inference mode, without flow matching for fast inference. False: normal inference mode, with flow matching for high quality.')
+    parser.add_argument('--fp16', default=True, type=bool, required=False, help='inference with fp16 model')
+    parser.add_argument('--fade_out', default=True, type=bool, required=False, help='add fade out effect to generated audio')
+    parser.add_argument('--fade_out_duration', default=1.0, type=float, required=False, help='fade out duration in seconds')
+    parser.add_argument('--trim', default=False, type=bool, required=False, help='trim the silence ending of generated audio')
+    parser.add_argument('--format', type=str, default="wav", required=False,
+                        choices=["wav", "mp3", "m4a", "flac"],
+                        help='sampling rate of input audio')
+    parser.add_argument('--sample_rate', type=int, default=24000, required=False,
+                        help='sampling rate of input audio')
+    parser.add_argument('--output_sample_rate', type=int, default=48000, required=False, choices=[24000, 48000],
+                        help='sampling rate of generated output audio')
+    parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0, required=False,
+                        help='the minimum generated audio length in seconds')
+    parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0, required=False,
+                        help='the maximum generated audio length in seconds')
+    parser.add_argument('--gpu',
+                        type=int,
+                        default=0,
+                        help='gpu id for this rank, -1 for cpu')
+    parser.add_argument('--task',
+                        default='text-to-music',
+                        choices=['text-to-music', 'continuation', "reconstruct", "super_resolution"],
+                        help='choose inference task type. text-to-music: text-to-music task. continuation: music continuation task. reconstruct: reconstruction of original music. super_resolution: convert original 24kHz music into 48kHz music.')
+    parser.add_argument('--result_dir', required=True, help='asr result file')
+    args = parser.parse_args()
+    print(args)
+    return args
+def main():
+	args = get_args()
+	logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
+	os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+	if args.fast:
+		args.output_sample_rate = 24000
+	min_generate_audio_length = int(args.output_sample_rate * args.min_generate_audio_seconds)
+	max_generate_audio_length = int(args.output_sample_rate * args.max_generate_audio_seconds)
+	assert args.min_generate_audio_seconds <= args.max_generate_audio_seconds
+	# Init inspiremusic models from configs
+	use_cuda = args.gpu >= 0 and torch.cuda.is_available()
+	device = torch.device('cuda' if use_cuda else 'cpu')
+	with open(args.config, 'r') as f:
+		configs = load_hyperpyyaml(f)
+	model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], args.fast, args.fp16)
+	model.load(args.llm_model, args.flow_model, args.music_tokenizer, args.wavtokenizer)
+	if args.llm_model is None:
+		model.llm = None
+	else:
+		model.llm = model.llm.to(torch.float32)
+	if args.flow_model is None:
+		model.flow = None
+	test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=True, partition=False)
+	test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
+	del configs
+	os.makedirs(args.result_dir, exist_ok=True)
+	fn = os.path.join(args.result_dir, 'wav.scp')
+	f = open(fn, 'w')
+	caption_fn = os.path.join(args.result_dir, 'captions.txt')
+	caption_f = open(caption_fn, 'w')
+	with torch.no_grad():
+		for _, batch in tqdm(enumerate(test_data_loader)):
+			utts = batch["utts"]
+			assert len(utts) == 1, "inference mode only support batchsize 1"
+			text_token = batch["text_token"].to(device)
+			text_token_len = batch["text_token_len"].to(device)
+			if "time_start" not in batch.keys():
+				batch["time_start"] = torch.randint(0, args.min_generate_audio_seconds, (1,)).to(torch.float64)
+			if batch["time_start"].numpy()[0] > 300:
+				batch["time_start"] = torch.Tensor([0]).to(torch.float64)
+			if "time_end" not in batch.keys():
+				batch["time_end"] = torch.randint(int(batch["time_start"].numpy()[0] + args.min_generate_audio_seconds), int(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds), (1,)).to(torch.float64)
+			else:
+				if (batch["time_end"].numpy()[0] - batch["time_start"].numpy()[0]) < args.min_generate_audio_seconds:
+					batch["time_end"] = torch.randint(int(batch["time_start"].numpy()[0] + args.min_generate_audio_seconds), int(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds), (1,)).to(torch.float64)
+				elif (batch["time_end"].numpy()[0] - batch["time_start"].numpy()[0]) > args.max_generate_audio_seconds:
+					batch["time_end"] = torch.Tensor([(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds)]).to(torch.float64)
+			if "chorus" not in batch.keys():
+				batch["chorus"] = torch.randint(1, 5, (1,))
+			if args.chorus == "random":
+				batch["chorus"] = torch.randint(1, 5, (1,))
+			elif args.chorus == "intro":
+				batch["chorus"] = torch.Tensor([0])
+			elif "verse" in args.chorus:
+				batch["chorus"] = torch.Tensor([1])
+			elif args.chorus == "chorus":
+				batch["chorus"] = torch.Tensor([2])
+			elif args.chorus == "outro":
+				batch["chorus"] = torch.Tensor([4])
+			else:
+				batch["chorus"] = batch["chorus"]
+			time_start = batch["time_start"].to(device)
+			time_end = batch["time_end"].to(device)
+			chorus = batch["chorus"].to(torch.int)
+			text_prompt = f"<|{batch['time_start'].numpy()[0]}|><|{MUSIC_STRUCTURE_LABELS[chorus.numpy()[0]]}|><|{batch['text'][0]}|><|{batch['time_end'].numpy()[0]}|>"
+			chorus = chorus.to(device)
+			if batch["acoustic_token"] is None:
+				audio_token = None
+				audio_token_len = None
+			else:
+				audio_token = batch["acoustic_token"].to(device)
+				audio_token_len = batch["acoustic_token_len"].to(device)
+			text = batch["text"]
+			if "semantic_token" in batch:
+				token = batch["semantic_token"].to(device)
+				token_len = batch["semantic_token_len"].to(device)
+			else:
+				if audio_token is None:
+					token = None
+					token_len = None
+				else:
+					token = audio_token.view(audio_token.size(0), -1, 4)[:, :, 0]
+					token_len = audio_token_len / 4
+			if args.task in ['text-to-music', 'continuation']:
+				# text to music, music continuation
+				model_input = {"text": text, "audio_token": token,
+							   "audio_token_len": token_len,
+							   "text_token": text_token,
+							   "text_token_len": text_token_len,
+							   "embeddings": [time_start, time_end, chorus],
+							   "raw_text": text,
+							   "sample_rate": args.output_sample_rate,
+							   "duration_to_gen": args.max_generate_audio_seconds,
+							   "task": args.task}
+			elif args.task in ['reconstruct', 'super_resolution']:
+				# audio reconstruction, audio super resolution
+				model_input = {"text": text, "audio_token": audio_token,
+							   "audio_token_len": audio_token_len,
+							   "text_token": text_token,
+							   "text_token_len": text_token_len,
+							   "embeddings": [time_start, time_end, chorus],
+							   "raw_text": text,
+							   "sample_rate": args.output_sample_rate,
+							   "duration_to_gen": args.max_generate_audio_seconds,
+							   "task": args.task}
+			else:
+				# zero-shot
+				model_input = {'text'                       : text,
+							   'text_len'                   : text_token_len,
+							   'prompt_text'                : text_token,
+							   'prompt_text_len'            : text_token_len,
+							   'llm_prompt_audio_token'     : token,
+							   'llm_prompt_audio_token_len' : token_len,
+							   'flow_prompt_audio_token'    : audio_token,
+							   'flow_prompt_audio_token_len': audio_token_len,
+							   'prompt_audio_feat'          : audio_feat,
+							   'prompt_audio_feat_len'      : audio_feat_len,
+							   "embeddings"                 : [time_start,
+															   time_end,
+															   chorus]}
+			music_key = utts[0]
+			music_audios = []
+			music_fn = os.path.join(args.result_dir, f'{music_key}.{args.format}')
+			bench_start = time.time()
+			for model_output in model.inference(**model_input):
+				music_audios.append(model_output['music_audio'])
+			bench_end = time.time()
+			if args.trim:
+				music_audio = trim_audio(music_audios[0],
+										 sample_rate=args.output_sample_rate,
+										 threshold=0.05,
+										 min_silence_duration=0.8)
+			else:
+				music_audio = music_audios[0]
+			if music_audio.shape[0] != 0:
+				if music_audio.shape[1] > max_generate_audio_length:
+					music_audio = music_audio[:, :max_generate_audio_length]
+				if music_audio.shape[1] >= min_generate_audio_length:
+					try:
+						if args.fade_out:
+							music_audio = fade_out(music_audio, args.output_sample_rate, args.fade_out_duration)
+						music_audio = music_audio.repeat(2, 1)
+						if args.format in ["wav", "flac"]:
+							torchaudio.save(music_fn, music_audio, sample_rate=args.output_sample_rate, encoding="PCM_S", bits_per_sample=24)
+						elif args.format in ["mp3", "m4a"]:
+							torchaudio.backend.sox_io_backend.save(filepath=music_fn, src=music_audio, sample_rate=args.output_sample_rate, format=args.format)
+						else:
+							logging.info(f"Format is not supported. Please choose from wav, mp3, m4a, flac.")
+					except Exception as e:
+						logging.info(f"Error saving file: {e}")
+						raise
+					audio_duration = music_audio.shape[1] / args.output_sample_rate
+					rtf = (bench_end - bench_start) / audio_duration
+					logging.info(f"processing time: {int(bench_end - bench_start)}s, audio length: {int(audio_duration)}s, rtf: {rtf}, text prompt: {text_prompt}")
+					f.write('{} {}\n'.format(music_key, music_fn))
+					f.flush()
+					caption_f.write('{}\t{}\n'.format(music_key, text_prompt))
+					caption_f.flush()
+				else:
+					logging.info(f"Generate audio length {music_audio.shape[1]} is shorter than min_generate_audio_length.")
+			else:
+				logging.info(f"Generate audio is empty, dim = {music_audio.shape[0]}.")
+	f.close()
+	logging.info('Result wav.scp saved in {}'.format(fn))
+# Script entry point: run inference only when executed directly, not on import.
+if __name__ == '__main__':
+	main()

inspiremusic/bin/train.py ADDED Viewed

	@@ -0,0 +1,194 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import datetime
+import logging
+logging.getLogger('matplotlib').setLevel(logging.WARNING)
+from copy import deepcopy
+import torch
+import torch.distributed as dist
+import deepspeed
+import glob
+import os
+from hyperpyyaml import load_hyperpyyaml
+from torch.cuda.amp import GradScaler, autocast
+from torch.distributed.elastic.multiprocessing.errors import record
+from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
+from inspiremusic.utils.executor import Executor
+from inspiremusic.utils.train_utils import (
+    init_distributed,
+    init_dataset_and_dataloader,
+    init_optimizer_and_scheduler,
+    init_summarywriter, save_model,
+    wrap_cuda_model, check_modify_and_save_config)
+def get_args():
+    parser = argparse.ArgumentParser(description='training your network')
+    parser.add_argument('--train_engine',
+                        default='torch_ddp',
+                        choices=['torch_ddp', 'deepspeed'],
+                        help='Engine for paralleled training')
+    parser.add_argument('--model', required=True, help='model which will be trained')
+    parser.add_argument('--config', required=True, help='config file')
+    parser.add_argument('--train_data', required=True, help='train data file')
+    parser.add_argument('--cv_data', required=True, help='cv data file')
+    parser.add_argument('--checkpoint', help='checkpoint model')
+    parser.add_argument('--model_dir', required=True, help='save model dir')
+    parser.add_argument('--tensorboard_dir',
+                        default='tensorboard',
+                        help='tensorboard log dir')
+    parser.add_argument('--ddp.dist_backend',
+                        dest='dist_backend',
+                        default='nccl',
+                        choices=['nccl', 'gloo'],
+                        help='distributed backend')
+    parser.add_argument('--num_workers',
+                        default=0,
+                        type=int,
+                        help='number of subprocess workers for reading')
+    parser.add_argument('--prefetch',
+                        default=100,
+                        type=int,
+                        help='prefetch number')
+    parser.add_argument('--pin_memory',
+                        action='store_true',
+                        default=True,
+                        help='Use pinned memory buffers used for reading')
+    parser.add_argument('--deepspeed.save_states',
+                        dest='save_states',
+                        default='model_only',
+                        choices=['model_only', 'model+optimizer'],
+                        help='save model/optimizer states')
+    parser.add_argument('--timeout',
+                        default=30,
+                        type=int,
+                        help='timeout (in seconds) of inspiremusic_join.')
+    parser.add_argument('--fp16',
+                          action='store_true',
+                          default=False,
+                          help='Enable fp16 mixed precision training')
+    parser.add_argument('--lora',
+                          action='store_true',
+                          default=False,
+                          help='Enable LoRA training')
+    parser.add_argument('--lora_rank',
+                          default=4,
+                          type=int,
+                          help='LoRA rank')
+    parser.add_argument('--lora_alpha',
+                          default=16,
+                          type=int,
+                          help='LoRA alpha')
+    parser.add_argument('--lora_dropout',
+                          default=0.1,
+                          type=float,
+                          help='LoRA dropout rate')
+    parser.add_argument('--lora_target_modules',
+                          nargs='+',
+                          default=["k_proj","v_proj"],
+                          help='Target modules to apply LoRA (e.g., ["q_proj", "v_proj"])')
+    parser = deepspeed.add_config_arguments(parser)
+    args = parser.parse_args()
+    return args
+@record
+def main():
+    args = get_args()
+    logging.basicConfig(level=logging.DEBUG,
+                        format='%(asctime)s %(levelname)s %(message)s')
+    override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
+    with open(args.config, 'r') as f:
+        configs = load_hyperpyyaml(f, overrides=override_dict)
+    configs['train_conf'].update(vars(args))
+    # Init env for ddp
+    init_distributed(args)
+    # Get dataset & dataloader
+    train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
+        init_dataset_and_dataloader(args, configs)
+    # Do some sanity checks and save config to arsg.model_dir
+    configs = check_modify_and_save_config(args, configs)
+    # Tensorboard summary
+    writer = init_summarywriter(args)
+    # load checkpoint
+    model = configs[args.model]
+    if args.checkpoint is not None:
+        model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
+    else:
+        # Find and load the latest checkpoint
+        checkpoint_files = glob.glob(os.path.join(args.model_dir, '*.pt'))
+        if checkpoint_files:
+            latest_checkpoint = max(checkpoint_files, key=os.path.getctime)
+            logging.info(f"Loaded latest checkpoint from {latest_checkpoint}")
+            model.load_state_dict(torch.load(latest_checkpoint, map_location='cpu'))
+    if args.lora:
+        logging.info("Applying LoRA to the model...")
+        if not args.lora_target_modules:
+            raise ValueError("No target modules specified for LoRA. Please provide --lora_target_modules.")
+        lora_config = LoraConfig(
+            task_type="CAUSAL_LM",  # Change to appropriate task type
+            inference_mode=False,
+            r=args.lora_rank,
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            target_modules=args.lora_target_modules
+        )
+        model.llm.model = get_peft_model(model.llm.model, lora_config)
+        # Optionally freeze the base model
+    else:
+        logging.info("LoRA is not enabled. Training the full model.")
+    # Dispatch model from cpu to gpu
+    model = wrap_cuda_model(args, model)
+    # Get optimizer & scheduler
+    model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
+    # Initialize AMP for torch_ddp if fp16 is enabled
+    scaler = None
+    if args.fp16:
+        scaler = GradScaler()
+        logging.info("Initialized AMP GradScaler for mixed precision training.")
+    # Save init checkpoints
+    info_dict = deepcopy(configs['train_conf'])
+    # Get executor
+    executor = Executor()
+    # Start training loop
+    for epoch in range(info_dict['max_epoch']):
+        executor.epoch = epoch
+        train_dataset.set_epoch(epoch)
+        dist.barrier()
+        group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
+        executor.train_one_epoch(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join, scaler=scaler)
+        dist.destroy_process_group(group_join)
+# Script entry point: start training only when executed directly, not on import.
+if __name__ == '__main__':
+    main()

inspiremusic/cli/__init__.py ADDED Viewed

File without changes

inspiremusic/cli/frontend.py ADDED Viewed

	@@ -0,0 +1,100 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+import torch
+from typing import Callable
+import re
+import inflect
+from inspiremusic.cli.model import InspireMusicModel
+from inspiremusic.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
+from inspiremusic.wavtokenizer.decoder.pretrained import WavTokenizer
+class InspireMusicFrontEnd:
+    def __init__(self,
+                 configs: Callable,
+                 get_tokenizer: Callable,
+                 llm_model: str,
+                 flow_model: str,
+                 music_tokenizer_dir: str,
+                 audio_tokenizer_dir: str,
+                 instruct: bool = False,
+                 fast: bool = False,
+                 fp16: bool = True,
+                 allowed_special: str = 'all'):
+        self.tokenizer = get_tokenizer()
+        self.audio_tokenizer_dir = audio_tokenizer_dir
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.bandwidth_id = torch.tensor([0]).to(self.device)
+        self.wavtokenizer = WavTokenizer.from_pretrained_feat(f"{audio_tokenizer_dir}/config.yaml", f"{audio_tokenizer_dir}/model.pt").to(self.device)
+        self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], fast, fp16)
+        self.model = self.model.load(llm_model, flow_model, music_tokenizer_dir, audio_tokenizer_dir)
+        self.instruct = instruct
+        self.allowed_special = allowed_special
+        self.inflect_parser = inflect.engine()
+    def _extract_text_token(self, text):
+        text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
+        text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
+        text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
+        return text_token, text_token_len
+    def _extract_audio_token(self, audio, sample_rate=24000):
+        audio = torch.tensor(audio, dtype=torch.float32, device=self.device)
+        _, audio_token = self.wavtokenizer.encode_infer(audio, bandwidth_id=self.bandwidth_id)
+        audio_token = audio_token.squeeze(0)
+        audio_token_len = torch.tensor([audio_token.shape[1]], dtype=torch.int32, device=self.device)
+        return audio_token, audio_token_len
+    def text_normalize(self, text, split=True):
+        text = text.strip()
+        if contains_chinese(text):
+            text = text.replace("\n", "")
+            text = replace_blank(text)
+            text = replace_corner_mark(text)
+            text = text.replace(".", "、")
+            text = text.replace(" - ", "，")
+            text = remove_bracket(text)
+            text = re.sub(r'[，,]+$', '。', text)
+            texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
+                                         token_min_n=60, merge_len=20, comma_split=False))
+        else:
+            text = spell_out_number(text, self.inflect_parser)
+            texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
+                                         token_min_n=60, merge_len=20, comma_split=False))
+        if split is False:
+            return text
+        return texts
+    def frontend_text_to_music(self, text, time_start, time_end, chorus):
+        text_token, text_token_len = self._extract_text_token(text)
+        model_input = {"text": text, "audio_token": None, "audio_token_len": None,
+                                "text_token": text_token, "text_token_len": text_token_len,
+                                "embeddings": [time_start, time_end, chorus], "raw_text":text}
+        return model_input
+    def frontend_continuation(self, text, audio, time_start, time_end, chorus, target_sr=24000):
+        if text is None:
+            text_token = None
+            text_token_len = None
+        else:
+            text_token, text_token_len = self._extract_text_token(text)
+        audio_token, audio_token_len = self._extract_audio_token(audio, target_sr)
+        model_input = {"text": text, "audio_token": audio_token, "audio_token_len": audio_token_len,
+                                "text_token": text_token, "text_token_len": text_token_len,
+                                "embeddings": [time_start, time_end, chorus], "raw_text":text}
+        return model_input

inspiremusic/cli/inference.py ADDED Viewed

	@@ -0,0 +1,312 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import torchaudio
+import time
+import logging
+import argparse
+from inspiremusic.cli.inspiremusic import InspireMusic
+from inspiremusic.utils.file_utils import logging
+import torch
+from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
+def set_env_variables():
+    os.environ['PYTHONIOENCODING'] = 'UTF-8'
+    os.environ['TOKENIZERS_PARALLELISM'] = 'False'
+    main_root = os.getcwd()
+    bin_dir = os.path.join(main_root, 'inspiremusic')
+    third_party_matcha_tts_path = os.path.join(main_root, 'third_party', 'Matcha-TTS')
+    python_path = f"{main_root}:{bin_dir}:{third_party_matcha_tts_path}:{os.environ.get('PYTHONPATH', '')}"
+    os.environ['PATH'] = python_path
+    sys.path.extend([main_root, third_party_matcha_tts_path])
+class InspireMusicUnified:
+    def __init__(self,
+                 model_name: str = "InspireMusic-1.5B-Long",
+                 model_dir: str = None,
+                 min_generate_audio_seconds: float = 10.0,
+                 max_generate_audio_seconds: float = 30.0,
+                 sample_rate: int = 24000,
+                 output_sample_rate: int = 48000,
+                 load_jit: bool = True,
+                 load_onnx: bool = False,
+                 fast: bool = False,
+                 fp16: bool = True,
+                 gpu: int = 0,
+                 result_dir: str = None,
+                 hub="modelscope"):
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
+        # Set model_dir or default to downloading if it doesn't exist
+        if model_dir is None:
+             model_dir = f"pretrained_models/{model_name}"
+        else:
+            model_dir = model_dir.replace("../../", "./")
+        if not os.path.isfile(f"{model_dir}/llm.pt"):
+            if hub == "modelscope":
+                from modelscope import snapshot_download
+                if model_name == "InspireMusic-Base":
+                    snapshot_download(f"iic/InspireMusic", local_dir=model_dir)
+                else:
+                    snapshot_download(f"iic/{model_name}", local_dir=model_dir)
+        self.model_dir = model_dir
+        print(self.model_dir)
+        self.sample_rate = sample_rate
+        self.output_sample_rate = 24000 if fast else output_sample_rate
+        self.result_dir = result_dir or f"exp/{model_name}"
+        os.makedirs(self.result_dir, exist_ok=True)
+        self.min_generate_audio_seconds = min_generate_audio_seconds
+        self.max_generate_audio_seconds = max_generate_audio_seconds
+        self.min_generate_audio_length = int(self.output_sample_rate * self.min_generate_audio_seconds)
+        self.max_generate_audio_length = int(self.output_sample_rate * self.max_generate_audio_seconds)
+        assert self.min_generate_audio_seconds <= self.max_generate_audio_seconds, "Min audio seconds must be less than or equal to max audio seconds"
+        use_cuda = gpu >= 0 and torch.cuda.is_available()
+        self.device = torch.device('cuda' if use_cuda else 'cpu')
+        self.model = InspireMusic(self.model_dir, load_jit=load_jit, load_onnx=load_onnx, fast=fast, fp16=fp16)
+        self.model.model.llm = self.model.model.llm.to(torch.float16)
+        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+    @torch.inference_mode()
+    def inference(self,
+                  task: str = 'text-to-music',
+                  text: str = None,
+                  audio_prompt: str = None, # audio prompt file path
+                  chorus: str = "verse",
+                  time_start: float = 0.0,
+                  time_end: float = 30.0,
+                  output_fn: str = "output_audio",
+                  max_audio_prompt_length: float = 5.0,
+                  fade_out_duration: float = 1.0,
+                  output_format: str = "wav",
+                  fade_out_mode: bool = True,
+                  trim: bool = False,
+                  ):
+        with torch.no_grad():
+            text_prompt = f"<|{time_start}|><|{chorus}|><|{text}|><|{time_end}|>"
+            chorus_dict = {"random": torch.randint(1, 5, (1,)).item(), "intro" : 0, "verse": 1, "chorus": 2, "outro": 4}
+            chorus = chorus_dict.get(chorus, 1)
+            chorus = torch.tensor([chorus], dtype=torch.int).to(self.device)
+            time_start_tensor = torch.tensor([time_start], dtype=torch.float64).to(self.device)
+            time_end_tensor = torch.tensor([time_end], dtype=torch.float64).to(self.device)
+            music_fn = os.path.join(self.result_dir, f'{output_fn}.{output_format}')
+            bench_start = time.time()
+            if task == 'text-to-music':
+                model_input = {
+                    "text"           : text,
+                    "audio_prompt"   : audio_prompt,
+                    "time_start"     : time_start_tensor,
+                    "time_end"       : time_end_tensor,
+                    "chorus"         : chorus,
+                    "task"           : task,
+                    "stream"         : False,
+                    "duration_to_gen": self.max_generate_audio_seconds,
+                    "sr"             : self.sample_rate
+                }
+            elif task == 'continuation':
+                if audio_prompt is not None:
+                    audio, _ = process_audio(audio_prompt, self.sample_rate)
+                    if audio.size(1) < self.sample_rate:
+                        logging.warning("Warning: Input prompt audio length is shorter than 1s. Please provide an appropriate length audio prompt and try again.")
+                        audio = None
+                    else:
+                        max_audio_prompt_length_samples = int(max_audio_prompt_length * self.sample_rate)
+                        audio = audio[:, :max_audio_prompt_length_samples]  # Trimming prompt audio
+                model_input = {
+                    "text"           : text,
+                    "audio_prompt"   : audio,
+                    "time_start"     : time_start_tensor,
+                    "time_end"       : time_end_tensor,
+                    "chorus"         : chorus,
+                    "task"           : task,
+                    "stream"         : False,
+                    "duration_to_gen": self.max_generate_audio_seconds,
+                    "sr"             : self.sample_rate
+                }
+            music_audios = []
+            for model_output in self.model.cli_inference(**model_input):
+                music_audios.append(model_output['music_audio'])
+            bench_end = time.time()
+            if trim:
+                music_audio = trim_audio(music_audios[0],
+                                         sample_rate=self.output_sample_rate,
+                                         threshold=0.05,
+                                         min_silence_duration=0.8)
+            else:
+                music_audio = music_audios[0]
+            if music_audio.shape[0] != 0:
+                if music_audio.shape[1] > self.max_generate_audio_length:
+                    music_audio = music_audio[:, :self.max_generate_audio_length]
+                if music_audio.shape[1] >= self.min_generate_audio_length:
+                    try:
+                        if fade_out_mode:
+                            music_audio = fade_out(music_audio, self.output_sample_rate, fade_out_duration)
+                        music_audio = music_audio.repeat(2, 1)
+                        if output_format in ["wav", "flac"]:
+                            torchaudio.save(music_fn, music_audio,
+                                            sample_rate=self.output_sample_rate,
+                                            encoding="PCM_S",
+                                            bits_per_sample=24)
+                        elif output_format in ["mp3", "m4a"]:
+                            torchaudio.backend.sox_io_backend.save(
+                                filepath=music_fn, src=music_audio,
+                                sample_rate=self.output_sample_rate,
+                                format=output_format)
+                        else:
+                            logging.info("Format is not supported. Please choose from wav, mp3, m4a, flac.")
+                    except Exception as e:
+                        logging.error(f"Error saving file: {e}")
+                        raise
+                audio_duration = music_audio.shape[1] / self.output_sample_rate
+                rtf = (bench_end - bench_start) / audio_duration
+                logging.info(f"Processing time: {int(bench_end - bench_start)}s, audio length: {int(audio_duration)}s, rtf: {rtf}, text prompt: {text_prompt}")
+            else:
+                logging.error(f"Generated audio length is shorter than minimum required audio length.")
+        if music_fn:
+            if os.path.exists(music_fn):
+                logging.info(f"Generated audio file {music_fn} is saved.")
+                return music_fn
+            else:
+                logging.error(f"{music_fn} does not exist.")
+def get_args():
+    parser = argparse.ArgumentParser(description='Run inference with your model')
+    parser.add_argument('-m', '--model_name', default="InspireMusic-1.5B-Long",
+                        help='Model name')
+    parser.add_argument('-d', '--model_dir',
+                        help='Model folder path')
+    parser.add_argument('-t', '--text', default="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.",
+                        help='Prompt text')
+    parser.add_argument('-a', '--audio_prompt', default=None,
+                        help='Prompt audio')
+    parser.add_argument('-c', '--chorus', default="intro",
+                        help='Chorus tag generation mode (e.g., random, verse, chorus, intro, outro)')
+    parser.add_argument('-f', '--fast', type=bool, default=False,
+                        help='Enable fast inference mode (without flow matching)')
+    parser.add_argument('-g', '--gpu', type=int, default=0,
+                        help='GPU ID for this rank, -1 for CPU')
+    parser.add_argument('--task', default='text-to-music', choices=['text-to-music', 'continuation', 'reconstruct', 'super_resolution'],
+                        help='Inference task type: text-to-music, continuation, reconstruct, super_resolution')
+    parser.add_argument('-r', '--result_dir', default="exp/inspiremusic",
+                        help='Directory to save generated audio')
+    parser.add_argument('-o', '--output_fn', default="output_audio",
+                        help='Output file name')
+    parser.add_argument('--format', type=str, default="wav", choices=["wav", "mp3", "m4a", "flac"],
+                        help='Format of output audio')
+    parser.add_argument('--sample_rate', type=int, default=24000,
+                        help='Sampling rate of input audio')
+    parser.add_argument('--output_sample_rate', type=int, default=48000, choices=[24000, 48000],
+                        help='Sampling rate of generated output audio')
+    parser.add_argument('-s', '--time_start', type=float, default=0.0,
+                        help='Start time in seconds')
+    parser.add_argument('-e', '--time_end', type=float, default=30.0,
+                        help='End time in seconds')
+    parser.add_argument('--max_audio_prompt_length', type=float, default=5.0,
+                        help='Maximum audio prompt length in seconds')
+    parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0,
+                        help='Minimum generated audio length in seconds')
+    parser.add_argument('--max_generate_audio_seconds', type=float, default=300.0,
+                        help='Maximum generated audio length in seconds')
+    parser.add_argument('--fp16', type=bool, default=True,
+                        help='Inference with fp16 model')
+    parser.add_argument('--fade_out', type=bool, default=True,
+                        help='Apply fade out effect to generated audio')
+    parser.add_argument('--fade_out_duration', type=float, default=1.0,
+                        help='Fade out duration in seconds')
+    parser.add_argument('--trim', type=bool, default=False,
+                        help='Trim the silence ending of generated audio')
+    args = parser.parse_args()
+    if not args.model_dir:
+        args.model_dir = os.path.join("pretrained_models", args.model_name)
+    print(args)
+    return args
+def main():
+    set_env_variables()
+    args = get_args()
+    model = InspireMusicUnified(model_name = args.model_name,
+                 model_dir = args.model_dir,
+                 min_generate_audio_seconds = args.min_generate_audio_seconds,
+                 max_generate_audio_seconds = args.max_generate_audio_seconds,
+                 sample_rate = args.sample_rate,
+                 output_sample_rate = args.output_sample_rate,
+                 load_jit = True,
+                 load_onnx = False,
+                 fast = args.fast,
+                 fp16 = args.fp16,
+                 gpu = args.gpu,
+                 result_dir = args.result_dir)
+    model.inference(task = args.task,
+                text = args.text,
+                audio_prompt = args.audio_prompt,
+                chorus = args.chorus,
+                time_start = args.time_start,
+                time_end = args.time_end,
+                output_fn = args.output_fn,
+                max_audio_prompt_length = args.max_audio_prompt_length,
+                fade_out_duration = args.fade_out_duration,
+                output_format = args.format,
+                fade_out_mode = args.fade_out,
+                trim = args.trim)
+if __name__ == "__main__":
+    main()

inspiremusic/cli/inspiremusic.py ADDED Viewed

	@@ -0,0 +1,143 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import time
+from tqdm import tqdm
+from hyperpyyaml import load_hyperpyyaml
+from inspiremusic.cli.frontend import InspireMusicFrontEnd
+from inspiremusic.cli.model import InspireMusicModel
+from inspiremusic.utils.file_utils import logging
+import torch
+class InspireMusic:
+    def __init__(self, model_dir, load_jit=True, load_onnx=False, fast = False, fp16=True, hub="modelscope"):
+        instruct = True if '-Instruct' in model_dir else False
+        if model_dir is None:
+             model_dir = f"pretrained_models/InspireMusic-1.5B-Long"
+        if not os.path.isfile(f"{model_dir}/llm.pt"):
+            model_name = model_dir.split("/")[-1]
+            if hub == "modelscope":
+                from modelscope import snapshot_download
+                if model_name == "InspireMusic-Base":
+                    snapshot_download(f"iic/InspireMusic", local_dir=model_dir)
+                else:
+                    snapshot_download(f"iic/{model_name}", local_dir=model_dir)
+        assert os.path.exists(f'{model_dir}/inspiremusic.yaml')
+        with open('{}/inspiremusic.yaml'.format(model_dir), 'r') as f:
+            configs = load_hyperpyyaml(f)
+        self.frontend = InspireMusicFrontEnd(configs,
+                                          configs['get_tokenizer'],
+                                          '{}/llm.pt'.format(model_dir),
+                                          '{}/flow.pt'.format(model_dir),
+                                          '{}/music_tokenizer/'.format(model_dir),
+                                          '{}/wavtokenizer/'.format(model_dir),
+                                          instruct,
+                                          fast,
+                                          fp16,
+                                          configs['allowed_special'])
+        self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], fast, fp16)
+        self.model.load('{}/llm.pt'.format(model_dir),
+                        '{}/flow.pt'.format(model_dir),
+                        '{}/music_tokenizer/'.format(model_dir),
+                        '{}/wavtokenizer/model.pt'.format(model_dir))
+        del configs
+    @torch.inference_mode()
+    def inference(self, task, text, audio, time_start, time_end, chorus, stream=False, sr=24000):
+        if task == "text-to-music":
+            for i in tqdm(self.frontend.text_normalize(text, split=True)):
+                model_input = self.frontend.frontend_text_to_music(i, time_start, time_end, chorus)
+                start_time = time.time()
+                logging.info('prompt text {}'.format(i))
+                for model_output in self.model.inference(**model_input, stream=stream):
+                    music_audios_len = model_output['music_audio'].shape[1] / sr
+                    logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
+                    yield model_output
+                    start_time = time.time()
+        elif task == "continuation":
+            if text is None:
+                if audio is not None:
+                    for i in tqdm(audio):
+                        model_input = self.frontend.frontend_continuation(None, i, time_start, time_end, chorus, sr, max_audio_length)
+                        start_time = time.time()
+                        logging.info('prompt text {}'.format(i))
+                        for model_output in self.model.continuation_inference(**model_input, stream=stream):
+                            music_audios_len = model_output['music_audio'].shape[1] / sr
+                            logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
+                            yield model_output
+                            start_time = time.time()
+            else:
+                if audio is not None:
+                    for i in tqdm(self.frontend.text_normalize(text, split=True)):
+                        model_input = self.frontend.frontend_continuation(i, audio, time_start, time_end, chorus, sr, max_audio_length)
+                        start_time = time.time()
+                        logging.info('prompt text {}'.format(i))
+                        for model_output in self.model.continuation_inference(**model_input, stream=stream):
+                            music_audios_len = model_output['music_audio'].shape[1] / sr
+                            logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
+                            yield model_output
+                            start_time = time.time()
+                else:
+                    print("Please input text or audio.")
+        else:
+            print("Currently only support text-to-music and music continuation tasks.")
+    @torch.inference_mode()
+    def cli_inference(self, text, audio_prompt, time_start, time_end, chorus, task, stream=False, duration_to_gen=30, sr=24000):
+        if task == "text-to-music":
+            model_input = self.frontend.frontend_text_to_music(text, time_start, time_end, chorus)
+            logging.info('prompt text {}'.format(text))
+        elif task == "continuation":
+            model_input = self.frontend.frontend_continuation(text, audio_prompt, time_start, time_end, chorus, sr)
+            logging.info('prompt audio length: {}'.format(len(audio_prompt)))
+        start_time = time.time()
+        for model_output in self.model.inference(**model_input, duration_to_gen=duration_to_gen, task=task):
+            music_audios_len = model_output['music_audio'].shape[1] / sr
+            logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
+            yield model_output
+            start_time = time.time()
+    @torch.inference_mode()
+    def inference_zero_shot(self, text, prompt_text, prompt_audio_16k, stream=False, sr=24000):
+        prompt_text = self.frontend.text_normalize(prompt_text, split=False)
+        for i in tqdm(self.frontend.text_normalize(text, split=True)):
+            model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_audio_16k)
+            start_time = time.time()
+            logging.info('prompt text {}'.format(i))
+            for model_output in self.model.inference(**model_input, stream=stream):
+                audio_len = model_output['music_audio'].shape[1] / sr
+                logging.info('yield audio len {}, rtf {}'.format(audio_len, (time.time() - start_time) / audio_len))
+                yield model_output
+                start_time = time.time()
+    @torch.inference_mode()
+    def inference_instruct(self, text, spk_id, instruct_text, stream=False, sr=24000):
+        if self.frontend.instruct is False:
+            raise ValueError('{} do not support instruct inference'.format(self.model_dir))
+        instruct_text = self.frontend.text_normalize(instruct_text, split=False)
+        for i in tqdm(self.frontend.text_normalize(text, split=True)):
+            model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
+            start_time = time.time()
+            logging.info('prompt text {}'.format(i))
+            for model_output in self.model.inference(**model_input, stream=stream):
+                audio_len = model_output['music_audio'].shape[1] / sr
+                logging.info('yield audio len {}, rtf {}'.format(audio_len, (time.time() - start_time) / audio_len))
+                yield model_output
+                start_time = time.time()

inspiremusic/cli/model.py ADDED Viewed

	@@ -0,0 +1,295 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import threading
+import time
+from contextlib import nullcontext
+import uuid
+from inspiremusic.music_tokenizer.vqvae import VQVAE
+from inspiremusic.wavtokenizer.decoder.pretrained import WavTokenizer
+from torch.cuda.amp import autocast
+import logging
+import torch
+import os
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+class InspireMusicModel:
+    def __init__(self,
+                 llm: torch.nn.Module,
+                 flow: torch.nn.Module,
+                 music_tokenizer: torch.nn.Module,
+                 wavtokenizer: torch.nn.Module,
+                 fast: bool = False,
+                 fp16: bool = True,
+                 ):
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.llm = llm
+        self.flow = flow
+        self.music_tokenizer = music_tokenizer
+        self.wavtokenizer = wavtokenizer
+        self.fp16 = fp16
+        self.token_min_hop_len = 100
+        self.token_max_hop_len = 200
+        self.token_overlap_len = 20
+        # mel fade in out
+        self.mel_overlap_len = 34
+        self.mel_window = np.hamming(2 * self.mel_overlap_len)
+        # hift cache
+        self.mel_cache_len = 20
+        self.source_cache_len = int(self.mel_cache_len * 256)
+        # rtf and decoding related
+        self.stream_scale_factor = 1
+        assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf'
+        self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
+        self.lock = threading.Lock()
+        # dict used to store session related variable
+        self.music_token_dict = {}
+        self.llm_end_dict = {}
+        self.mel_overlap_dict = {}
+        self.fast = fast
+        self.generator = "hifi"
+    def load(self, llm_model, flow_model, hift_model, wavtokenizer_model):
+        if llm_model is not None:
+            self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
+            self.llm.to(self.device).eval()
+        else:
+            self.llm = None
+        if flow_model is not None:
+            self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
+            self.flow.to(self.device).eval()
+        if hift_model is not None:
+            if ".pt" not in hift_model:
+                self.music_tokenizer = VQVAE( hift_model + '/config.json',
+                                    hift_model + '/model.pt', with_encoder=True)
+            else:
+                self.music_tokenizer = VQVAE(os.path.dirname(hift_model) + '/config.json',
+                                    hift_model, with_encoder=True)
+            self.music_tokenizer.to(self.device).eval()
+        if wavtokenizer_model is not None:
+            if ".pt" not in wavtokenizer_model:
+                self.wavtokenizer = WavTokenizer.from_pretrained_feat( wavtokenizer_model + '/config.yaml',
+                                    wavtokenizer_model + '/model.pt')
+            else:
+                self.wavtokenizer = WavTokenizer.from_pretrained_feat( os.path.dirname(wavtokenizer_model) + '/config.yaml',
+                                    wavtokenizer_model )
+            self.wavtokenizer.to(self.device)
+    def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
+        assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model"
+        llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
+        self.llm.text_encoder = llm_text_encoder
+        llm_llm = torch.jit.load(llm_llm_model)
+        self.llm.llm = llm_llm
+        flow_encoder = torch.jit.load(flow_encoder_model)
+        self.flow.encoder = flow_encoder
+    def load_onnx(self, flow_decoder_estimator_model):
+        import onnxruntime
+        option = onnxruntime.SessionOptions()
+        option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+        option.intra_op_num_threads = 1
+        providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+        del self.flow.decoder.estimator
+        self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)
+    def llm_job(self, text, audio_token, audio_token_len, prompt_text, llm_prompt_audio_token, embeddings, uuid, duration_to_gen, task):
+        with self.llm_context:
+            local_res = []
+            with autocast(enabled=self.fp16):
+                inference_kwargs = {
+                    'text': text.to(self.device),
+                    'text_len': torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
+                    'prompt_text': prompt_text.to(self.device),
+                    'prompt_text_len': torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
+                    'prompt_audio_token': llm_prompt_audio_token.to(self.device),
+                    'prompt_audio_token_len': torch.tensor([llm_prompt_audio_token.shape[1]], dtype=torch.int32).to(self.device),
+                    'embeddings': embeddings,
+                    'duration_to_gen': duration_to_gen,
+                    'task': task
+                    }
+                if audio_token is not None:
+                    inference_kwargs['audio_token'] = audio_token.to(self.device)
+                else:
+                    inference_kwargs['audio_token'] = torch.Tensor([0]).to(self.device)
+                if audio_token_len is not None:
+                    inference_kwargs['audio_token_len'] = audio_token_len.to(self.device)
+                else:
+                    inference_kwargs['audio_token_len'] = torch.Tensor([0]).to(self.device)
+                for i in self.llm.inference(**inference_kwargs):
+                    local_res.append(i)
+            self.music_token_dict[uuid] = local_res
+        self.llm_end_dict[uuid] = True
+    # def token2wav(self, token, token_len, text, text_len, uuid, sample_rate, finalize=False):
+    def token2wav(self, token, token_len, uuid, sample_rate, finalize=False, flow_cfg=None):
+        # if self.flow is not None:
+        #     if isinstance(self.flow,MaskedDiffWithText):
+        #         codec_embed = self.flow.inference(token=token.to(self.device),
+        #                                         token_len=token_len.to(self.device),
+        #                                         text_token=text,
+        #                                         text_token_len=text_len,
+        #                                         )
+        #     else:
+        if flow_cfg is not None:
+            codec_embed = self.flow.inference_cfg(token=token.to(self.device),
+                                token_len=token_len.to(self.device),
+                                sample_rate=sample_rate
+                                )
+        else:
+            codec_embed = self.flow.inference(token=token.to(self.device),
+                                token_len=token_len.to(self.device),
+                                sample_rate=sample_rate
+                                )
+        # use music_tokenizer decoder
+        wav = self.music_tokenizer.generator(codec_embed)
+        wav = wav.squeeze(0).cpu().detach()
+        return wav
+    def acoustictoken2wav(self, token):
+        # use music_tokenizer to generate waveform from token
+        token = token.view(token.size(0), -1, 4)
+        # codec = token.view(1, -1, 4)
+        codec_embed = self.music_tokenizer.quantizer.embed(torch.tensor(token).long().to(self.device)).cuda()
+        wav = self.music_tokenizer.generator(codec_embed)
+        wav = wav.squeeze(0).cpu().detach()
+        return wav
+    def semantictoken2wav(self, token):
+        # fast mode, use wavtokenizer decoder
+        new_tensor = torch.tensor(token.to(self.device)).unsqueeze(0)
+        features = self.wavtokenizer.codes_to_features(new_tensor)
+        bandwidth_id = torch.tensor([0]).to(self.device)
+        wav = self.wavtokenizer.to(self.device).decode(features, bandwidth_id=bandwidth_id)
+        wav = wav.cpu().detach()
+        return wav
+    @torch.inference_mode()
+    def inference(self, text, audio_token, audio_token_len, text_token, text_token_len, embeddings=None,
+                  prompt_text=torch.zeros(1, 0, dtype=torch.int32),
+                  llm_prompt_audio_token=torch.zeros(1, 0, dtype=torch.int32),
+                  flow_prompt_audio_token=torch.zeros(1, 0, dtype=torch.int32),
+                  prompt_audio_feat=torch.zeros(1, 0, 80), sample_rate=48000, duration_to_gen = 30, task="continuation", trim = True, stream=False, **kwargs):
+        # this_uuid is used to track variables related to this inference thread
+        # support tasks:
+        # text to music task
+        # music continuation task
+        # require either audio input only or text and audio inputs
+        this_uuid = str(uuid.uuid1())
+        if self.llm:
+            with self.lock:
+                self.music_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
+            p = threading.Thread(target=self.llm_job, args=(text_token, audio_token, audio_token_len, prompt_text, llm_prompt_audio_token, embeddings, this_uuid, duration_to_gen, task))
+            p.start()
+        if stream is True:
+            token_hop_len = self.token_min_hop_len
+            while True:
+                time.sleep(0.1)
+                if len(self.music_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
+                    this_music_audio = self.token2wav(token=text_token,
+                                                     token_len=text_token_len,
+                                                        uuid=this_uuid,
+                                                        sample_rate=sample_rate,
+                                                        finalize=False)
+                    yield {'music_audio': this_music_audio.cpu()}
+                    with self.lock:
+                        self.music_token_dict[this_uuid] = self.music_token_dict[this_uuid][token_hop_len:]
+                    # increase token_hop_len for better audio quality
+                    token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
+                if self.llm_end_dict[this_uuid] is True and len(self.music_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
+                    break
+            p.join()
+            # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
+            this_music_token = torch.concat(self.music_token_dict[this_uuid], dim=1)
+            with self.flow_hift_context:
+                this_music_audio = self.token2wav(token=this_music_token,
+                                                 prompt_token=flow_prompt_audio_token,
+                                                 prompt_feat=prompt_audio_feat,
+                                                 embedding=flow_embedding,
+                                                 uuid=this_uuid,
+                                                 sample_rate=sample_rate,
+                                                 finalize=True)
+            yield {'music_audio': this_music_audio.cpu()}
+        else:
+            # deal with all tokens
+            if self.fast:
+                if task == "reconstruct":
+                    assert audio_token is None
+                    this_music_token = audio_token
+                    this_music_audio = self.acoustictoken2wav(token=this_music_token)
+                else:
+                    if self.llm:
+                        p.join()
+                        print(len(self.music_token_dict[this_uuid]))
+                        this_music_token = torch.concat(self.music_token_dict[this_uuid], dim=1)
+                        print(this_music_token.shape)
+                    else:
+                        this_music_token = text_token
+                    logging.info("using wavtokenizer generator without flow matching")
+                    this_music_audio = self.semantictoken2wav(token=this_music_token)
+                    print(this_music_audio.shape)
+            else:
+                if self.llm:
+                    p.join()
+                    if len(self.music_token_dict[this_uuid]) != 0:
+                        this_music_token = torch.concat(self.music_token_dict[this_uuid], dim=1)
+                    else:
+                        print(f"The list of tensors is empty for UUID: {this_uuid}")
+                else:
+                    this_music_token = text_token
+                logging.info(f"LLM generated audio token length: {this_music_token.shape[1]}")
+                logging.info(f"using flow matching and {self.generator} generator")
+                if self.generator == "hifi":
+                    if (embeddings[1] - embeddings[0]) <= duration_to_gen:
+                        if trim:
+                            trim_length = (int((embeddings[1] - embeddings[0])*75))
+                            this_music_token = this_music_token[:, :trim_length]
+                            logging.info(f"After trimmed, generated audio token length: {this_music_token.shape[1]}")
+                    elif (embeddings[1] - embeddings[0]) < 1:
+                        logging.info(f"Given audio length={(embeddings[1] - embeddings[0])}, which is too short, please give a longer audio length.")
+                    this_music_audio = self.token2wav(token=this_music_token,
+                                                token_len=torch.LongTensor([this_music_token.size(1)]),
+                                                uuid=this_uuid,
+                                                sample_rate=sample_rate,
+                                                finalize=True)
+                    logging.info(f"Generated audio sequence length: {this_music_audio.shape[1]}")
+                elif self.generator == "wavtokenizer":
+                    if (embeddings[1] - embeddings[0]) < duration_to_gen:
+                        if trim:
+                            trim_length = (int((embeddings[1] - embeddings[0])*75))
+                            this_music_token = this_music_token[:,:trim_length]
+                            logging.info(f"After trimmed, generated audio token length: {this_music_token.shape[1]}")
+                    elif (embeddings[1] - embeddings[0]) < 1:
+                        logging.info(f"Given audio length={(embeddings[1] - embeddings[0])}, which is too short, please give a longer audio length.")
+                    this_music_audio = self.semantictoken2wav(token=this_music_token)
+            yield {'music_audio': this_music_audio.cpu()}
+            torch.cuda.synchronize()

inspiremusic/dataset/__init__.py ADDED Viewed

File without changes

inspiremusic/dataset/dataset.py ADDED Viewed

	@@ -0,0 +1,154 @@

+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#               2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+import json
+import math
+from functools import partial
+import torch
+import torch.distributed as dist
+from torch.utils.data import IterableDataset
+from inspiremusic.utils.file_utils import read_lists, read_json_lists
+class Processor(IterableDataset):
+    def __init__(self, source, f, *args, **kw):
+        assert callable(f)
+        self.source = source
+        self.f = f
+        self.args = args
+        self.kw = kw
+    def set_epoch(self, epoch):
+        self.source.set_epoch(epoch)
+    def __iter__(self):
+        """ Return an iterator over the source dataset processed by the
+            given processor.
+        """
+        assert self.source is not None
+        assert callable(self.f)
+        return self.f(iter(self.source), *self.args, **self.kw)
+    def apply(self, f):
+        assert callable(f)
+        return Processor(self, f, *self.args, **self.kw)
+class DistributedSampler:
+    def __init__(self, shuffle=True, partition=True):
+        self.epoch = -1
+        self.update()
+        self.shuffle = shuffle
+        self.partition = partition
+    def update(self):
+        assert dist.is_available()
+        if dist.is_initialized():
+            self.rank = dist.get_rank()
+            self.world_size = dist.get_world_size()
+        else:
+            self.rank = 0
+            self.world_size = 1
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:
+            self.worker_id = 0
+            self.num_workers = 1
+        else:
+            self.worker_id = worker_info.id
+            self.num_workers = worker_info.num_workers
+        return dict(rank=self.rank,
+                    world_size=self.world_size,
+                    worker_id=self.worker_id,
+                    num_workers=self.num_workers)
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+    def sample(self, data):
+        """ Sample data according to rank/world_size/num_workers
+            Args:
+                data(List): input data list
+            Returns:
+                List: data list after sample
+        """
+        data = list(range(len(data)))
+        # force datalist even
+        if self.partition:
+            if self.shuffle:
+                random.Random(self.epoch).shuffle(data)
+            if len(data) < self.world_size:
+                print(len(data), self.world_size)
+                data = data * math.ceil(self.world_size / len(data))
+                data = data[:self.world_size]
+            data = data[self.rank::self.world_size]
+        if len(data) < self.num_workers:
+            data = data * math.ceil(self.num_workers / len(data))
+            data = data[:self.num_workers]
+        data = data[self.worker_id::self.num_workers]
+        return data
+class DataList(IterableDataset):
+    def __init__(self, lists, shuffle=True, partition=True):
+        self.lists = lists
+        self.sampler = DistributedSampler(shuffle, partition)
+    def set_epoch(self, epoch):
+        self.sampler.set_epoch(epoch)
+    def __iter__(self):
+        sampler_info = self.sampler.update()
+        indexes = self.sampler.sample(self.lists)
+        for index in indexes:
+            data = dict(src=self.lists[index])
+            data.update(sampler_info)
+            yield data
+def Dataset(data_list_file,
+            data_pipeline,
+            mode='train',
+            shuffle=True,
+            partition=True
+            ):
+    """ Construct dataset from arguments
+        We have two shuffle stage in the Dataset. The first is global
+        shuffle at shards tar/raw file level. The second is global shuffle
+        at training samples level.
+        Args:
+            data_type(str): raw/shard
+            tokenizer (BaseTokenizer): tokenizer to tokenize
+            partition(bool): whether to do data partition in terms of rank
+    """
+    assert mode in ['train', 'inference', 'processing']
+    lists = read_lists(data_list_file)
+    dataset = DataList(lists,
+                       shuffle=shuffle,
+                       partition=partition)
+    for func in data_pipeline:
+        dataset = Processor(dataset, func, mode=mode)
+    return dataset

inspiremusic/dataset/processor.py ADDED Viewed

	@@ -0,0 +1,595 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import random
+import pyarrow.parquet as pq
+import torch
+import torchaudio
+from torch.nn.utils.rnn import pad_sequence
+import torch.nn.functional as F
+import numpy as np
+import re
+# Select the 'soundfile' backend for torchaudio at import time.
+# NOTE(review): set_audio_backend is deprecated in newer torchaudio releases;
+# confirm it still exists in the pinned version.
+torchaudio.set_audio_backend('soundfile')
+# Presumably the recognised audio file extensions; not referenced in the
+# visible part of this module.
+AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
+# Song-section label -> integer id. "verse" and "verse1" share id 2.
+CHORUS = {"intro": 0, "chorus": 1, "verse1": 2, "verse2": 3, "verse": 2,
+		  "outro": 4}
+# Matches a whole line that is a bracketed metadata tag, e.g. "[ti:...]".
+metadata_pattern = re.compile(r'^\[(ti|ar|al|by|offset):.*\]$')
+# Matches a line starting with a "[dd:dd.dd]" timestamp; group 1 is the text
+# that follows the timestamp.
+timestamp_pattern = re.compile(r'^\[\d{2}:\d{2}\.\d{2}\](.*)$')
+def parquet_opener(data, mode='train', audio_data={}):
+	""" Give url or local file, return file descriptor
+        Inplace operation.
+        Args:
+            data(Iterable[str]): url or local file list
+        Returns:
+            Iterable[{src, stream}]
+    """
+	for sample in data:
+		assert 'src' in sample
+		url = sample['src']
+		try:
+			df = pq.read_table(url).to_pandas()
+			for i in df.index:
+				sample.update(dict(df.loc[i]))
+				yield {**sample}
+		except Exception as ex:
+			logging.warning('Failed to open {}, ex info {}'.format(url, ex))
+def clean_lyrics(data, mode="train"):
+	for sample in data:
+		lyrics = sample["text"]
+		cleaned = []
+		for line in lyrics.splitlines():
+			if metadata_pattern.match(line):
+				continue
+			timestamp_match = timestamp_pattern.match(line)
+			if timestamp_match:
+				lyric = timestamp_match.group(1).strip()
+				if lyric:
+					cleaned.append(lyric)
+			else:
+				if line.strip():
+					cleaned.append(line.strip())
+		sample["text"] = '\n'.join(cleaned)
+		yield sample
+def cut_by_length(data, max_length=8000, num_times=4, mode="train"):
+	for sample in data:
+		if "semantic_token" in sample:
+			sample["semantic_token"] = [
+				sample["semantic_token"][0][:max_length]]
+		if "acoustic_token" not in sample:
+			sample["acoustic_token"] = sample["speech_token"]
+		sample["acoustic_token"] = sample["acoustic_token"][
+								   :max_length * num_times]
+		yield sample
+def filter(data,
+           max_length=22500,  # 22500 #5min #10240
+           max_acoustic_length=45000,
+           min_length=10,
+           min_acoustic_length=150,
+           token_max_length=200,
+           token_min_length=1,
+           min_output_input_ratio=0.0005,
+           max_output_input_ratio=1,
+           mode='train'):
+	""" Filter sample according to feature and label length
+        Inplace operation.
+        Args::
+            data: Iterable[{key, wav, label, sample_rate}]
+            max_length: drop utterance which is greater than max_length(10ms)
+            min_length: drop utterance which is less than min_length(10ms)
+            token_max_length: drop utterance which is greater than
+                token_max_length, especially when use char unit for
+                english modeling
+            token_min_length: drop utterance which is
+                less than token_max_length
+            min_output_input_ratio: minimal ration of
+                token_length / feats_length(10ms)
+            max_output_input_ratio: maximum ration of
+                token_length / feats_length(10ms)
+        Returns:
+            Iterable[{key, wav, label, sample_rate}]
+    """
+	if mode == "train":
+		for sample in data:
+			if "semantic_token" in sample:
+				new_sample_frames = sample['semantic_token'][0].shape[0]
+			else:
+				new_sample_frames = sample['speech_token']
+			if "text_token" in sample:
+				new_sample_frames += len(sample['text_token'])
+			if new_sample_frames > max_length or new_sample_frames < min_length:
+				print(f"skipped 1 item length={new_sample_frames}")
+				continue
+			sample["chorus"] = sample["chorus"].split(",")
+			if not isinstance(sample["time_start"], np.ndarray):
+				sample["time_start"] = [sample["time_start"]]
+				sample["time_end"] = [sample["time_end"]]
+			for i, t in enumerate(sample["chorus"]):
+				if sample["chorus"][i] == "verse":
+					sample["chorus"][i] = "verse1"
+			yield sample
+	if mode == "train_flow":
+		for sample in data:
+			if "semantic_token" in sample:
+				new_sample_frames = sample['semantic_token'][0].shape[0]
+			if "acoustic_token" in sample:
+				target_sample_frames = sample['acoustic_token'][0].shape[0]
+			if new_sample_frames > max_length or new_sample_frames < min_acoustic_length or new_sample_frames < min_length or target_sample_frames > max_acoustic_length:
+				print(
+					f"skipped 1 item length={new_sample_frames}, target_length={target_sample_frames}")
+				continue
+			yield sample
+	elif mode == "inference":
+		for sample in data:
+			yield sample
+def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
+	""" Resample data.
+        Inplace operation.
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+            resample_rate: target resample rate
+        Returns:
+            Iterable[{key, wav, label, sample_rate}]
+    """
+	for sample in data:
+		assert 'sample_rate' in sample
+		assert 'speech' in sample
+		sample_rate = sample['sample_rate']
+		waveform = sample['speech']
+		if sample_rate != resample_rate:
+			if sample_rate < min_sample_rate:
+				continue
+			sample['sample_rate'] = resample_rate
+			sample['speech'] = torchaudio.transforms.Resample(
+					orig_freq=sample_rate, new_freq=resample_rate)(waveform)
+		max_val = sample['speech'].abs().max()
+		if max_val > 1:
+			sample['speech'] /= max_val
+		yield sample
+def truncate(data, truncate_length=24576, mode='train'):
+	""" Truncate data.
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+            truncate_length: truncate length
+        Returns:
+            Iterable[{key, wav, label, sample_rate}]
+    """
+	for sample in data:
+		waveform = sample['audio']
+		if waveform.shape[1] > truncate_length:
+			start = random.randint(0, waveform.shape[1] - truncate_length)
+			waveform = waveform[:, start: start + truncate_length]
+		else:
+			waveform = torch.concat([waveform, torch.zeros(1, truncate_length -
+														   waveform.shape[1])],
+									dim=1)
+		sample['audio'] = waveform
+		yield sample
+def upsample(data, resample_rate=48000, min_sample_rate=16000, mode='train',
+			 n_codebook=4):
+	""" Resample data.
+        Inplace operation.
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+            resample_rate: target resample rate
+        Returns:
+            Iterable[{key, wav, label, sample_rate}]
+    """
+	for sample in data:
+		assert 'semantic_token' in sample
+		# TODO: unify data processing key names
+		if 'acoustic_token' not in sample:
+			continue
+		if 'sample_rate' in sample.keys():
+			sample_rate = sample['sample_rate']
+		else:
+			sample_rate = 24000
+		token = np.array(sample['semantic_token'][0][:-1])
+		# Calculate the repetition factor for resampling
+		repetition_factor = int(n_codebook * resample_rate / sample_rate)
+		if sample_rate != resample_rate:
+			if sample_rate < min_sample_rate:
+				continue
+			sample['sample_rate'] = resample_rate
+			sample['semantic_token'] = np.array(
+					[np.repeat(token, repetition_factor)])
+		yield sample
+def compute_fbank(data,
+				  feat_extractor,
+				  mode='train'):
+	""" Extract fbank
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+	for sample in data:
+		assert 'sample_rate' in sample
+		assert 'speech' in sample
+		assert 'utt' in sample
+		assert 'text_token' in sample
+		waveform = sample['speech']
+		mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
+		sample['speech_feat'] = mat
+		del sample['speech']
+		yield sample
+def parse_embedding(data, normalize, mode='train'):
+	""" Parse utt_embedding/spk_embedding
+        Args:
+            data: Iterable[{key, wav, label, sample_rate}]
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+	for sample in data:
+		sample['utt_embedding'] = torch.tensor(sample['utt_embedding'],
+											   dtype=torch.float32)
+		sample['spk_embedding'] = torch.tensor(sample['spk_embedding'],
+											   dtype=torch.float32)
+		if normalize:
+			sample['utt_embedding'] = F.normalize(sample['utt_embedding'],
+												  dim=0)
+			sample['spk_embedding'] = F.normalize(sample['spk_embedding'],
+												  dim=0)
+		yield sample
+def tokenize(data, get_tokenizer, allowed_special, mode='train'):
+	""" Decode text to chars or BPE
+        Inplace operation
+        Args:
+            data: Iterable[{key, wav, txt, sample_rate}]
+        Returns:
+            Iterable[{key, wav, txt, tokens, label, sample_rate}]
+    """
+	tokenizer = get_tokenizer()
+	for sample in data:
+		assert 'text' in sample
+		sample['text_token'] = tokenizer.encode(sample['text'],
+												allowed_special=allowed_special)
+		yield sample
+def shuffle(data, shuffle_size=10000, mode='train'):
+	""" Local shuffle the data
+        Args:
+            data: Iterable[{key, feat, label}]
+            shuffle_size: buffer size for shuffle
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+	buf = []
+	for sample in data:
+		buf.append(sample)
+		if len(buf) >= shuffle_size:
+			random.shuffle(buf)
+			for x in buf:
+				yield x
+			buf = []
+	# The sample left over
+	random.shuffle(buf)
+	for x in buf:
+		yield x
+def sort(data, sort_size=500, mode='train'):
+	""" Sort the data by feature length.
+        Sort is used after shuffle and before batch, so we can group
+        utts with similar lengths into a batch, and `sort_size` should
+        be less than `shuffle_size`
+        Args:
+            data: Iterable[{key, feat, label}]
+            sort_size: buffer size for sort
+        Returns:
+            Iterable[{key, feat, label}]
+    """
+	buf = []
+	for sample in data:
+		if sample["chorus"] == "verse":
+			sample["chorus"] = "verse1"
+		if sample["acoustic_token"].shape[0] == 1:
+			sample["acoustic_token"] = np.concatenate(
+					sample["acoustic_token"][0])
+		else:
+			sample["acoustic_token"] = np.concatenate(sample["acoustic_token"])
+		sample["acoustic_token"] = torch.from_numpy(sample["acoustic_token"])
+		buf.append(sample)
+		if len(buf) >= sort_size:
+			buf.sort(key=lambda x: x['acoustic_token'].size(0))
+			for x in buf:
+				yield x
+			buf = []
+	# The sample left over
+	buf.sort(key=lambda x: x['acoustic_token'].size(0))
+	for x in buf:
+		yield x
+def static_batch(data, batch_size=32):
+	""" Static batch the data by `batch_size`
+        Args:
+            data: Iterable[{key, feat, label}]
+            batch_size: batch size
+        Returns:
+            Iterable[List[{key, feat, label}]]
+    """
+	buf = []
+	data_empty = True
+	for sample in data:
+		data_empty = False
+		buf.append(sample)
+		if len(buf) >= batch_size:
+			yield buf
+			buf = []
+	if data_empty:
+		raise ValueError("data is empty")
+	if len(buf) > 0:
+		yield buf
+def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
+	""" Dynamic batch the data until the total frames in batch
+        reach `max_frames_in_batch`
+        Args:
+            data: Iterable[{key, feat, label}]
+            max_frames_in_batch: max_frames in one batch
+        Returns:
+            Iterable[List[{key, feat, label}]]
+    """
+	buf = []
+	longest_frames = 0
+	for sample in data:
+		assert 'acoustic_token' in sample
+		assert isinstance(sample['acoustic_token'], torch.Tensor)
+		if 'semantic_token' in sample:
+			new_sample_frames = sample['semantic_token'][0].shape[0]
+		else:
+			new_sample_frames = sample['semantic_token']
+		if "text_token" in sample:
+			new_sample_frames += len(sample['text_token'])
+		longest_frames = max(longest_frames, new_sample_frames)
+		frames_after_padding = longest_frames * (len(buf) + 1)
+		if frames_after_padding > max_frames_in_batch:
+			if len(buf) > 0:
+				yield buf
+			buf = [sample]
+			longest_frames = new_sample_frames
+		else:
+			buf.append(sample)
+	if len(buf) > 0:
+		yield buf
+def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000,
+		  mode='train'):
+	""" Wrapper for static/dynamic batch
+    """
+	if mode == 'inference':
+		return static_batch(data, 1)
+	elif mode == 'processing':
+		return static_batch(data, batch_size)
+	else:
+		if batch_type == 'static':
+			return static_batch(data, batch_size)
+		elif batch_type == 'dynamic':
+			return dynamic_batch(data, max_frames_in_batch)
+		else:
+			logging.fatal('Unsupported batch type {}'.format(batch_type))
+def padding(data, mode='train'):
+	""" Collate lists of samples into padded batch dicts.
+	Each incoming item is a list of sample dicts. Token sequences are padded
+	with 0 via ``pad_sequence(batch_first=True)`` and their original lengths
+	are returned next to them. Chorus labels are mapped through ``CHORUS``:
+	string labels become one tensor of ids, and sequences of labels become a
+	tensor padded with -1.
+	In "train" mode the samples are reordered by descending acoustic token
+	length and empty lists are skipped with a log message. In "inference"
+	mode the original order is kept, and the acoustic entries are None when
+	the first sample has no 'acoustic_token'. Any other mode yields nothing.
+	Args:
+		data: Iterable[List[dict]]
+	Returns:
+		Iterable[dict] with keys utts, acoustic_token, acoustic_token_len,
+		time_start, time_end, chorus, text, text_token, text_token_len and,
+		when the first sample has it, semantic_token, semantic_token_len
+	"""
+	if mode == "train":
+		for sample in data:
+			assert isinstance(sample, list)
+			if len(sample) != 0:
+				acoustic_feat_len = torch.tensor(
+						[x['acoustic_token'].size(0) for x in sample],
+						dtype=torch.int32)
+				# Indices of the samples, longest acoustic sequence first.
+				order = torch.argsort(acoustic_feat_len, descending=True)
+				utts = [sample[i]['utt'] for i in order]
+				acoustic_token = [
+					sample[i]['acoustic_token'].clone().to(torch.int32) for i in
+					order]
+				acoustic_token_len = torch.tensor(
+						[i.size(0) for i in acoustic_token], dtype=torch.int32)
+				acoustic_token = pad_sequence(acoustic_token,
+											  batch_first=True,
+											  padding_value=0)
+				text = [sample[i]['text'] for i in order]
+				text_token = [torch.tensor(sample[i]['text_token']).long() for i
+							  in order]
+				text_token_len = torch.tensor([i.size(0) for i in text_token],
+											  dtype=torch.int32)
+				text_token = pad_sequence(text_token, batch_first=True,
+										  padding_value=0)
+				time_start = torch.tensor(
+						[sample[i]['time_start'] for i in order])
+				time_end = torch.tensor([sample[i]['time_end'] for i in order])
+				# The type of the first sample's label decides how the whole
+				# batch is encoded.
+				if isinstance(sample[0]['chorus'], str):
+					chorus = torch.tensor(
+							[CHORUS[sample[i]['chorus']] for i in order])
+				else:
+					chorus = [
+						torch.tensor([CHORUS[t] for t in sample[i]['chorus']])
+						for i in order]
+					chorus = pad_sequence(chorus, batch_first=True,
+										  padding_value=-1)
+				batch = {
+					"utts"              : utts,
+					"acoustic_token"    : acoustic_token,
+					"acoustic_token_len": acoustic_token_len,
+					"time_start"        : time_start,
+					"time_end"          : time_end,
+					"chorus"            : chorus,
+					"text"              : text,
+					"text_token"        : text_token,
+					"text_token_len"    : text_token_len,
+				}
+				# Only the first semantic stream (index 0) of each sample is
+				# batched.
+				if "semantic_token" in sample[0]:
+					semantic_token = [
+						torch.tensor(sample[i]['semantic_token'][0],
+									 dtype=torch.int32) for i in order]
+					semantic_token_len = torch.tensor(
+							[i.size(0) for i in semantic_token],
+							dtype=torch.int32)
+					semantic_token = pad_sequence(semantic_token,
+												  batch_first=True,
+												  padding_value=0)
+					batch.update({"semantic_token"    : semantic_token,
+								  "semantic_token_len": semantic_token_len})
+				yield batch
+			else:
+				logging.info("WARNING: sample is empty []!")
+	elif mode == "inference":
+		for sample in data:
+			assert isinstance(sample, list)
+			# Unlike the train branch, the samples keep their incoming order.
+			utts = [sample[i]['utt'] for i in range(len(sample))]
+			text = [sample[i]['text'] for i in range(len(sample))]
+			text_token = [torch.tensor(sample[i]['text_token']).long() for i in
+						  range(len(sample))]
+			text_token_len = torch.tensor([i.size(0) for i in text_token],
+										  dtype=torch.int32)
+			text_token = pad_sequence(text_token, batch_first=True,
+									  padding_value=0)
+			time_start = torch.tensor(
+					[sample[i]['time_start'] for i in range(len(sample))])
+			time_end = torch.tensor(
+					[sample[i]['time_end'] for i in range(len(sample))])
+			if isinstance(sample[0]['chorus'], str):
+				chorus = torch.tensor([CHORUS[sample[i]['chorus']] for i in
+									   range(len(sample))])
+			else:
+				chorus = [torch.tensor([CHORUS[t] for t in sample[i]['chorus']])
+						  for i in range(len(sample))]
+				chorus = pad_sequence(chorus, batch_first=True,
+									  padding_value=-1)
+			# Acoustic tokens are optional here; the batch carries None for
+			# both entries when they are absent.
+			if "acoustic_token" in sample[0]:
+				acoustic_token = [
+					sample[i]['acoustic_token'].clone().to(torch.int32) for i in
+					range(len(sample))]
+				acoustic_token_len = torch.tensor(
+						[i.size(0) for i in acoustic_token], dtype=torch.int32)
+				acoustic_token = pad_sequence(acoustic_token,
+											  batch_first=True,
+											  padding_value=0)
+			else:
+				acoustic_token = None
+				acoustic_token_len = None
+			batch = {
+				"utts"              : utts,
+				"acoustic_token"    : acoustic_token,
+				"acoustic_token_len": acoustic_token_len,
+				"time_start"        : time_start,
+				"time_end"          : time_end,
+				"chorus"            : chorus,
+				"text"              : text,
+				"text_token"        : text_token,
+				"text_token_len"    : text_token_len,
+			}
+			if "semantic_token" in sample[0]:
+				semantic_token = [torch.tensor(sample[i]['semantic_token'][0],
+											   dtype=torch.int32) for i in
+								  range(len(sample))]
+				semantic_token_len = torch.tensor(
+						[i.size(0) for i in semantic_token], dtype=torch.int32)
+				semantic_token = pad_sequence(semantic_token,
+											  batch_first=True,
+											  padding_value=0)
+				batch.update({"semantic_token"    : semantic_token,
+							  "semantic_token_len": semantic_token_len})
+			yield batch

inspiremusic/flow/decoder.py ADDED Viewed

	@@ -0,0 +1,277 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+from einops import pack, rearrange, repeat
+from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
+from matcha.models.components.transformer import BasicTransformerBlock
+class Transpose(torch.nn.Module):
+    def __init__(self, dim0: int, dim1: int):
+        super().__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+    def forward(self, x: torch.Tensor):
+        x = torch.transpose(x, self.dim0, self.dim1)
+        return x
+class CausalBlock1D(Block1D):
+    def __init__(self, dim: int, dim_out: int):
+        super(CausalBlock1D, self).__init__(dim, dim_out)
+        self.block = torch.nn.Sequential(
+            CausalConv1d(dim, dim_out, 3),
+            Transpose(1, 2),
+            nn.LayerNorm(dim_out),
+            Transpose(1, 2),
+            nn.Mish(),
+        )
+    def forward(self, x: torch.Tensor, mask: torch.Tensor):
+        output = self.block(x * mask)
+        return output * mask
+class CausalResnetBlock1D(ResnetBlock1D):
+    def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
+        super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
+        self.block1 = CausalBlock1D(dim, dim_out)
+        self.block2 = CausalBlock1D(dim_out, dim_out)
+class CausalConv1d(torch.nn.Conv1d):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = 'zeros',
+        device=None,
+        dtype=None
+    ) -> None:
+        super(CausalConv1d, self).__init__(in_channels, out_channels,
+                                           kernel_size, stride,
+                                           padding=0, dilation=dilation,
+                                           groups=groups, bias=bias,
+                                           padding_mode=padding_mode,
+                                           device=device, dtype=dtype)
+        assert stride == 1
+        self.causal_padding = (kernel_size - 1, 0)
+    def forward(self, x: torch.Tensor):
+        x = F.pad(x, self.causal_padding)
+        x = super(CausalConv1d, self).forward(x)
+        return x
+class ConditionalDecoder(nn.Module):
+    """1-D U-Net style decoder conditioned on a timestep embedding.
+    It is built from down, mid and up stages. Every stage pairs a
+    ResnetBlock1D with ``n_blocks`` BasicTransformerBlock modules; down and
+    up stages also carry a resampling layer, or a plain Conv1d on the last
+    stage.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        channels=(256, 256),
+        dropout=0.05,
+        attention_head_dim=64,
+        n_blocks=1,
+        num_mid_blocks=2,
+        num_heads=4,
+        act_fn="snake",
+    ):
+        """
+        This decoder requires an input with the same shape of the target. So, if your text content
+        is shorter or longer than the outputs, please re-sampling it before feeding to the decoder.
+        Args:
+            in_channels: channel count of the packed decoder input
+            out_channels: channel count produced by the final projection
+            channels: channel width of each down stage (mirrored for the up stages)
+            dropout: dropout passed to the transformer blocks
+            attention_head_dim: per-head dimension of the transformer blocks
+            n_blocks: transformer blocks per stage
+            num_mid_blocks: number of mid stages
+            num_heads: attention heads per transformer block
+            act_fn: activation name passed to the transformer blocks
+        """
+        super().__init__()
+        channels = tuple(channels)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.time_embeddings = SinusoidalPosEmb(in_channels)
+        time_embed_dim = channels[0] * 4
+        self.time_mlp = TimestepEmbedding(
+            in_channels=in_channels,
+            time_embed_dim=time_embed_dim,
+            act_fn="silu",
+        )
+        self.down_blocks = nn.ModuleList([])
+        self.mid_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+        # Down path: one stage per entry of ``channels``.
+        output_channel = in_channels
+        for i in range(len(channels)):  # pylint: disable=consider-using-enumerate
+            input_channel = output_channel
+            output_channel = channels[i]
+            is_last = i == len(channels) - 1
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            # The last down stage uses a plain Conv1d instead of Downsample1D.
+            downsample = (
+                Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
+        # Mid path: ``num_mid_blocks`` stages at the width of the last down stage.
+        for _ in range(num_mid_blocks):
+            input_channel = channels[-1]
+            # NOTE(review): this rebinds the ``out_channels`` argument rather than
+            # ``output_channel``. self.out_channels was stored above and the layers
+            # below use ``output_channel`` left over from the down loop (also
+            # channels[-1]), so the result is unaffected -- confirm the intent.
+            out_channels = channels[-1]
+            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
+        # Up path: reversed widths plus one extra stage back at channels[0].
+        channels = channels[::-1] + (channels[0],)
+        for i in range(len(channels) - 1):
+            # Doubled because forward() packs the skip connection onto x.
+            input_channel = channels[i] * 2
+            output_channel = channels[i + 1]
+            is_last = i == len(channels) - 2
+            resnet = ResnetBlock1D(
+                dim=input_channel,
+                dim_out=output_channel,
+                time_emb_dim=time_embed_dim,
+            )
+            transformer_blocks = nn.ModuleList(
+                [
+                    BasicTransformerBlock(
+                        dim=output_channel,
+                        num_attention_heads=num_heads,
+                        attention_head_dim=attention_head_dim,
+                        dropout=dropout,
+                        activation_fn=act_fn,
+                    )
+                    for _ in range(n_blocks)
+                ]
+            )
+            upsample = (
+                Upsample1D(output_channel, use_conv_transpose=True)
+                if not is_last
+                else nn.Conv1d(output_channel, output_channel, 3, padding=1)
+            )
+            self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
+        self.final_block = Block1D(channels[-1], channels[-1])
+        self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
+        self.initialize_weights()
+    def initialize_weights(self):
+        """Initialize Conv1d/Linear weights with Kaiming-normal and zero biases;
+        GroupNorm gets weight 1 and bias 0."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.GroupNorm):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+    def forward(self, x, mask, mu, t, spks=None, cond=None):
+        """Forward pass of the UNet1DConditional model.
+        Args:
+            x (torch.Tensor): shape (batch_size, in_channels, time)
+            mask (_type_): shape (batch_size, 1, time)
+            mu (_type_): packed with ``x`` along the channel axis, so it must
+                share the batch and time dimensions of ``x``
+            t (_type_): shape (batch_size)
+            spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
+            cond (_type_, optional): packed onto the input along the channel
+                axis when given. Defaults to None.
+        Returns:
+            torch.Tensor: output of the final projection multiplied by ``mask``
+        """
+        t = self.time_embeddings(t).to(t.dtype)
+        t = self.time_mlp(t)
+        # Stack x, mu and the optional conditions along the channel axis.
+        x = pack([x, mu], "b * t")[0]
+        if spks is not None:
+            # Repeat the per-utterance vector along the time axis.
+            spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
+            x = pack([x, spks], "b * t")[0]
+        if cond is not None:
+            x = pack([x, cond], "b * t")[0]
+        hiddens = []
+        masks = [mask]
+        for resnet, transformer_blocks, downsample in self.down_blocks:
+            mask_down = masks[-1]
+            x = resnet(x, mask_down, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            # Outer product of the mask with itself: a (time x time) attention mask.
+            attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            hiddens.append(x)  # Save hidden states for skip connections
+            x = downsample(x * mask_down)
+            # Keep every second mask frame for the next stage.
+            masks.append(mask_down[:, :, ::2])
+        # Drop the mask appended after the last down stage.
+        masks = masks[:-1]
+        mask_mid = masks[-1]
+        for resnet, transformer_blocks in self.mid_blocks:
+            x = resnet(x, mask_mid, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+        for resnet, transformer_blocks, upsample in self.up_blocks:
+            mask_up = masks.pop()
+            skip = hiddens.pop()
+            # Trim x to the skip tensor's length before packing them together.
+            x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
+            x = resnet(x, mask_up, t)
+            x = rearrange(x, "b c t -> b t c").contiguous()
+            attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
+            for transformer_block in transformer_blocks:
+                x = transformer_block(
+                    hidden_states=x,
+                    attention_mask=attn_mask,
+                    timestep=t,
+                )
+            x = rearrange(x, "b t c -> b c t").contiguous()
+            x = upsample(x * mask_up)
+        # mask_up here is the mask popped by the last up stage.
+        x = self.final_block(x, mask_up)
+        output = self.final_proj(x * mask_up)
+        return output * mask

inspiremusic/flow/flow.py ADDED Viewed

	@@ -0,0 +1,143 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import random
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from omegaconf import DictConfig
+from inspiremusic.utils.mask import make_pad_mask
+from inspiremusic.music_tokenizer.vqvae import VQVAE
+class MaskedDiff(torch.nn.Module):
+    """Token-conditioned flow-matching module.
+    Discrete tokens are embedded, encoded by ``encoder``, resampled by
+    ``length_regulator`` and handed to ``decoder`` as the conditioning signal,
+    either to compute a training loss (``forward``) or to sample (``inference``).
+    """
+    def __init__(self,
+                 input_size: int = 512,
+                 output_size: int = 128,
+                 output_type: str = "mel",
+                 vocab_size: int = 4096,
+                 input_frame_rate: int = 50,
+                 only_mask_loss: bool = True,
+                 encoder: torch.nn.Module = None,
+                 length_regulator: torch.nn.Module = None,
+                 decoder: torch.nn.Module = None,
+                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80,
+                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
+                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
+                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
+                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
+                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 128, 'sampling_rate': 48000,
+                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 48000},
+                generator_model_dir: str = "pretrained_models/InspireMusic-Base/music_tokenizer",
+                num_codebooks: int = 4
+                ):
+        """Store the configuration and build the embedding, projection and quantizer.
+        Args:
+            input_size: embedding dimension of the input tokens.
+            output_size: output dimension of ``encoder_proj``.
+            output_type: stored on the instance; not read elsewhere in this class.
+            vocab_size: number of rows in the token embedding table.
+            input_frame_rate: stored and logged; not read elsewhere in this class.
+            only_mask_loss: stored on the instance; not read elsewhere in this class.
+            encoder: module called as ``encoder(token, token_len)``; must expose ``output_size()``.
+            length_regulator: module called as ``length_regulator(h, lengths)``.
+            decoder: module providing ``compute_loss(...)`` and a sampling ``__call__``.
+            decoder_conf: stored as-is; not read elsewhere in this class.
+            mel_feat_conf: stored as-is; not read elsewhere in this class.
+            generator_model_dir: directory holding ``config.json`` and ``model.pt`` for the VQVAE.
+            num_codebooks: number of codebooks interleaved in ``acoustic_token``.
+        """
+        # NOTE(review): decoder_conf / mel_feat_conf defaults are mutable dicts that are
+        # stored by reference below, so all instances built with the defaults share them.
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+        self.input_embedding = nn.Embedding(vocab_size, input_size)
+        self.encoder = encoder
+        # NOTE(review): encoder_proj is created here but never applied in forward()/inference().
+        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
+        self.decoder = decoder
+        self.length_regulator = length_regulator
+        self.only_mask_loss = only_mask_loss
+        # Only the quantizer sub-module of the VQVAE is kept; it is put in eval mode
+        # and is only ever called under torch.no_grad() in forward().
+        self.quantizer = VQVAE( f'{generator_model_dir}/config.json',
+                                  f'{generator_model_dir}/model.pt',with_encoder=True).quantizer
+        self.quantizer.eval()
+        self.num_codebooks  = num_codebooks
+        # Prefix conditioning is off by default: forward() only builds `conds` when this is truthy.
+        self.cond = None
+        # Not read anywhere in this class.
+        self.interpolate = False
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        """Compute the flow-matching training loss for one batch.
+        Args:
+            batch: must contain ``'acoustic_token'`` and ``'acoustic_token_len'``;
+                may contain ``'semantic_token'`` and ``'semantic_token_len'``.
+            device: device the batch tensors are moved to.
+        Returns:
+            ``{'loss': loss}`` where ``loss`` is the first value returned by
+            ``decoder.compute_loss``.
+        """
+        audio_token = batch['acoustic_token'].to(device)
+        audio_token_len = batch['acoustic_token_len'].to(device)
+        # Un-flatten the token stream into (batch, frames, num_codebooks).
+        audio_token  = audio_token.view(audio_token.size(0),-1,self.num_codebooks)
+        if "semantic_token" not in batch:
+            # No semantic tokens supplied: fall back to the first codebook as encoder input.
+            token = audio_token[:,:,0]
+            token_len = (audio_token_len/self.num_codebooks).long()
+        else:
+            token = batch['semantic_token'].to(device)
+            token_len = batch['semantic_token_len'].to(device)
+        with torch.no_grad():
+            # Regression target from the frozen quantizer.
+            # NOTE(review): the layout of `feat` is not visible here — the cond branch below
+            # indexes it as (batch, time, dim) while compute_loss is documented as taking
+            # (batch, n_feats, time); confirm against VQVAE.quantizer.embed.
+            feat = self.quantizer.embed(audio_token)
+            feat_len = (audio_token_len/self.num_codebooks).long()
+        token = self.input_embedding(token)
+        h, h_lengths = self.encoder(token, token_len)
+        # The regulator is given the target lengths feat_len; h_lengths is not used afterwards.
+        h, h_lengths = self.length_regulator(h, feat_len)
+        # get conditions
+        if self.cond:
+            conds = torch.zeros(feat.shape, device=token.device)
+            for i, j in enumerate(feat_len):
+                # Half of the samples keep an all-zero condition.
+                if random.random() < 0.5:
+                    continue
+                # Otherwise copy a random-length prefix of up to 30% of the sequence
+                # (random.randint is inclusive on both ends).
+                index = random.randint(0, int(0.3 * j))
+                conds[i, :index] = feat[i, :index]
+            conds = conds.transpose(1, 2)
+        else:
+            conds = None
+        # Inverted padding mask, cast to h's dtype/device via .to(h).
+        mask = (~make_pad_mask(feat_len)).to(h)
+        # The fourth positional argument is passed as None (presumably the speaker
+        # embedding slot of compute_loss — confirm against the decoder in use).
+        loss, _ = self.decoder.compute_loss(
+                feat,
+                mask.unsqueeze(1),
+                h.transpose(1, 2).contiguous(),
+                None,
+                cond=conds
+        )
+        return {'loss': loss}
+    @torch.inference_mode()
+    def inference(self,
+                  token,
+                  token_len,
+                  sample_rate):
+        """Sample features for a single token sequence.
+        Args:
+            token: token ids with batch size 1 (asserted).
+            token_len: length tensor for ``token``.
+            sample_rate: when equal to 48000 the target length is doubled.
+        Returns:
+            The output of ``decoder`` run for 10 steps with no speaker or prefix condition.
+        """
+        assert token.shape[0] == 1
+        # Negative ids are clamped to 0 before the embedding lookup.
+        token = self.input_embedding(torch.clamp(token, min=0))
+        h, h_lengths = self.encoder(token, token_len)
+        if sample_rate == 48000:
+            # Ask the length regulator for twice as many output frames.
+            token_len = 2 * token_len
+        h, h_lengths = self.length_regulator(h, token_len)
+        # get conditions
+        conds = None
+        mask = (~make_pad_mask(token_len)).to(h)
+        feat = self.decoder(
+            mu=h.transpose(1, 2).contiguous(),
+            mask=mask.unsqueeze(1),
+            spks=None,
+            cond=conds,
+            n_timesteps=10
+        )
+        return feat

inspiremusic/flow/flow_matching.py ADDED Viewed

	@@ -0,0 +1,167 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn.functional as F
+from matcha.models.components.flow_matching import BASECFM
+class ConditionalCFM(BASECFM):
+    def __init__(self, in_channels, cfm_params, estimator: torch.nn.Module = None):
+        super().__init__(
+            n_feats=in_channels,
+            cfm_params=cfm_params,
+        )
+        self.t_scheduler = cfm_params.t_scheduler
+        self.training_cfg_rate = cfm_params.training_cfg_rate
+        self.inference_cfg_rate = cfm_params.inference_cfg_rate
+        # Just change the architecture of the estimator here
+        self.estimator = estimator
+    @torch.inference_mode()
+    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
+        """Forward diffusion
+        Args:
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            n_timesteps (int): number of diffusion steps
+            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
+            spks (torch.Tensor, optional): speaker ids. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond: Not used but kept for future purposes
+        Returns:
+            sample: generated mel-spectrogram
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        z = torch.randn_like(mu) * temperature
+        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
+        if self.t_scheduler == 'cosine':
+            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
+        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
+    def solve_euler(self, x, t_span, mu, mask, spks, cond):
+        """
+        Fixed euler solver for ODEs.
+        Args:
+            x (torch.Tensor): random noise
+            t_span (torch.Tensor): n_timesteps interpolated
+                shape: (n_timesteps + 1,)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            spks (torch.Tensor, optional): speaker ids. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond: Not used but kept for future purposes
+        """
+        t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
+        t = t.unsqueeze(dim=0)
+        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
+        # Or in future might add like a return_all_steps flag
+        sol = []
+        for step in range(1, len(t_span)):
+            dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond)
+            # Classifier-Free Guidance inference introduced in VoiceBox
+            if self.inference_cfg_rate > 0:
+                cfg_dphi_dt = self.forward_estimator(
+                    x, mask,
+                    torch.zeros_like(mu), t,
+                    torch.zeros_like(spks) if spks is not None else None,
+                    torch.zeros_like(cond) if cond is not None else None
+                )
+                dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
+                           self.inference_cfg_rate * cfg_dphi_dt)
+            x = x + dt * dphi_dt
+            t = t + dt
+            sol.append(x)
+            if step < len(t_span) - 1:
+                dt = t_span[step + 1] - t
+        return sol[-1]
+    def forward_estimator(self, x, mask, mu, t, spks, cond):
+        if isinstance(self.estimator, torch.nn.Module):
+            return self.estimator.forward(x, mask, mu, t, spks, cond)
+        elif isinstance(self.estimator, onnxruntime.InferenceSession):
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy()
+            }
+            output = self.estimator.run(None, ort_inputs)[0]
+            return torch.tensor(output, dtype=x.dtype, device=x.device)
+        else:
+            self.estimator.set_input_shape('x', (2, 80, x.size(2)))
+            self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
+            self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
+            self.estimator.set_input_shape('t', (2,))
+            self.estimator.set_input_shape('spks', (2, 80))
+            self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
+            # run trt engine
+            self.estimator.execute_v2([x.contiguous().data_ptr(),
+                                       mask.contiguous().data_ptr(),
+                                       mu.contiguous().data_ptr(),
+                                       t.contiguous().data_ptr(),
+                                       spks.contiguous().data_ptr(),
+                                       cond.contiguous().data_ptr(),
+                                       x.data_ptr()])
+            return x
+    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
+        """Computes diffusion loss
+        Args:
+            x1 (torch.Tensor): Target
+                shape: (batch_size, n_feats, mo)
+            mask (torch.Tensor): target mask
+                shape: (batch_size, 1, mel_timesteps)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+        Returns:
+            loss: conditional flow matching loss
+            y: conditional flow
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        b, _, t = mu.shape
+        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
+        if self.t_scheduler == 'cosine':
+            t = 1 - torch.cos(t * 0.5 * torch.pi)
+        z = torch.randn_like(x1)
+        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
+        u = x1 - (1 - self.sigma_min) * z
+        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
+        if self.training_cfg_rate > 0:
+            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
+            mu = mu * cfg_mask.view(-1, 1, 1)
+            if cond is not None:
+                cond = cond * cfg_mask.view(-1, 1, 1)
+        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
+        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
+        return loss, y

inspiremusic/flow/length_regulator.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Tuple
+import torch.nn as nn
+import torch
+from torch.nn import functional as F
+from inspiremusic.utils.mask import make_pad_mask
+class InterpolateRegulator(nn.Module):
+    def __init__(
+            self,
+            channels: int,
+            sampling_ratios: Tuple,
+            out_channels: int = None,
+            groups: int = 1,
+    ):
+        super().__init__()
+        self.sampling_ratios = sampling_ratios
+        out_channels = out_channels or channels
+        model = nn.ModuleList([])
+        if len(sampling_ratios) > 0:
+            for _ in sampling_ratios:
+                module = nn.Conv1d(channels, channels, 3, 1, 1)
+                norm = nn.GroupNorm(groups, channels)
+                act = nn.Mish()
+                model.extend([module, norm, act])
+        model.append(
+            nn.Conv1d(channels, out_channels, 1, 1)
+        )
+        self.model = nn.Sequential(*model)
+    def forward(self, x, ylens=None):
+        # x in (B, T, D)
+        mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
+        x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='linear')
+        out = self.model(x).transpose(1, 2).contiguous()
+        olens = ylens
+        return out * mask, olens
+    def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
+        # in inference mode, interploate prompt token and token(head/mid/tail) seprately, so we can get a clear separation point of mel
+        # x in (B, T, D)
+        if x2.shape[1] > 40:
+            x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
+            x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
+                                   mode='linear')
+            x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
+            x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
+        else:
+            x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
+        if x1.shape[1] != 0:
+            x1 = F.interpolate(x1.transpose(1, 2).contiguous(), size=mel_len1, mode='linear')
+            x = torch.concat([x1, x2], dim=2)
+        else:
+            x = x2
+        out = self.model(x).transpose(1, 2).contiguous()
+        return out, mel_len1 + mel_len2

inspiremusic/hifigan/discriminator.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+from typing import List, Optional, Tuple
+from einops import rearrange
+from torchaudio.transforms import Spectrogram
+class MultipleDiscriminator(nn.Module):
+    def __init__(
+            self, mpd: nn.Module, mrd: nn.Module
+    ):
+        super().__init__()
+        self.mpd = mpd
+        self.mrd = mrd
+    def forward(self, y: torch.Tensor, y_hat: torch.Tensor):
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
+        this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mpd(y.unsqueeze(dim=1), y_hat.unsqueeze(dim=1))
+        y_d_rs += this_y_d_rs
+        y_d_gs += this_y_d_gs
+        fmap_rs += this_fmap_rs
+        fmap_gs += this_fmap_gs
+        this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mrd(y, y_hat)
+        y_d_rs += this_y_d_rs
+        y_d_gs += this_y_d_gs
+        fmap_rs += this_fmap_rs
+        fmap_gs += this_fmap_gs
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class MultiResolutionDiscriminator(nn.Module):
+    def __init__(
+        self,
+        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
+        num_embeddings: Optional[int] = None,
+    ):
+        """
+        Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
+        Additionally, it allows incorporating conditional information with a learned embeddings table.
+        Args:
+            fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
+            num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
+                Defaults to None.
+        """
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorR(window_length=w, num_embeddings=num_embeddings) for w in fft_sizes]
+        )
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for d in self.discriminators:
+            y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
+            y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
+            y_d_rs.append(y_d_r)
+            fmap_rs.append(fmap_r)
+            y_d_gs.append(y_d_g)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class DiscriminatorR(nn.Module):
+    """Single-resolution spectrogram discriminator.
+    The input is turned into a complex spectrogram, split into frequency bands,
+    each band is processed by its own stack of 2-D convolutions, and the band
+    outputs are concatenated and scored by a final convolution.
+    """
+    def __init__(
+        self,
+        window_length: int,
+        num_embeddings: Optional[int] = None,
+        channels: int = 32,
+        hop_factor: float = 0.25,
+        bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
+    ):
+        """
+        Args:
+            window_length: FFT size and window length of the spectrogram.
+            num_embeddings: size of the optional conditioning embedding table;
+                None builds no table.
+            channels: number of channels of every convolution (and embedding dimension).
+            hop_factor: hop length expressed as a fraction of ``window_length``.
+            bands: (start, end) fractions of the frequency axis, one pair per band.
+        """
+        super().__init__()
+        self.window_length = window_length
+        self.hop_factor = hop_factor
+        # power=None keeps the spectrogram complex; spectrogram() splits it into
+        # real/imaginary parts with torch.view_as_real.
+        self.spec_fn = Spectrogram(
+            n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
+        )
+        # Despite its name this is window_length // 2 + 1 (presumably the number of
+        # frequency bins of the spectrogram), used to turn band fractions into indices.
+        n_fft = window_length // 2 + 1
+        bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
+        self.bands = bands
+        # Factory so that every band gets its own, independently initialised stack.
+        convs = lambda: nn.ModuleList(
+            [
+                weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
+                weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))),
+            ]
+        )
+        self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
+        if num_embeddings is not None:
+            # Zero-initialised, so the conditional term starts out contributing nothing.
+            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
+            torch.nn.init.zeros_(self.emb.weight)
+        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
+    def spectrogram(self, x):
+        """Return the list of per-band spectrogram tensors for waveform ``x``.
+        Each band is laid out as (b, c, t, f) with c = 2 (real, imaginary).
+        """
+        # Remove DC offset
+        x = x - x.mean(dim=-1, keepdims=True)
+        # Peak normalize the volume of input audio
+        x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
+        x = self.spec_fn(x)
+        # Complex -> trailing real/imag axis, then move that axis to the channel position.
+        x = torch.view_as_real(x)
+        x = rearrange(x, "b f t c -> b c t f")
+        # Split into bands
+        x_bands = [x[..., b[0]: b[1]] for b in self.bands]
+        return x_bands
+    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
+        """Score ``x`` and collect intermediate feature maps.
+        Args:
+            x: input waveform.
+            cond_embedding_id: optional index into the embedding table; requires the
+                module to have been built with ``num_embeddings``.
+        Returns:
+            (score tensor, list of feature maps)
+        """
+        x_bands = self.spectrogram(x)
+        fmap = []
+        x = []
+        for band, stack in zip(x_bands, self.band_convs):
+            for i, layer in enumerate(stack):
+                band = layer(band)
+                band = torch.nn.functional.leaky_relu(band, 0.1)
+                # The output of the first layer of each band is not kept as a feature map.
+                if i > 0:
+                    fmap.append(band)
+            x.append(band)
+        # Re-join the bands along the frequency axis.
+        x = torch.cat(x, dim=-1)
+        if cond_embedding_id is not None:
+            emb = self.emb(cond_embedding_id)
+            # Projection of the features onto the embedding, summed over channels.
+            # NOTE(review): view(1, -1, 1, 1) looks like it assumes a single id — confirm.
+            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
+        else:
+            h = 0
+        x = self.conv_post(x)
+        fmap.append(x)
+        # In-place add: the tensor just appended to fmap is the same object, so the
+        # last feature map also includes h.
+        x += h
+        return x, fmap

inspiremusic/hifigan/f0_predictor.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+class ConvRNNF0Predictor(nn.Module):
+    def __init__(self,
+                 num_class: int = 1,
+                 in_channels: int = 80,
+                 cond_channels: int = 512
+                 ):
+        super().__init__()
+        self.num_class = num_class
+        self.condnet = nn.Sequential(
+            weight_norm(
+                nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+            weight_norm(
+                nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
+            ),
+            nn.ELU(),
+        )
+        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.condnet(x)
+        x = x.transpose(1, 2)
+        return torch.abs(self.classifier(x).squeeze(-1))

inspiremusic/hifigan/generator.py ADDED Viewed

	@@ -0,0 +1,411 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HIFI-GAN"""
+from typing import Dict, Optional, List
+import numpy as np
+from scipy.signal import get_window
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d
+from torch.nn import ConvTranspose1d
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils import weight_norm
+from torch.distributions.uniform import Uniform
+from inspiremusic.transformer.activation import Snake
+from inspiremusic.utils.common import get_padding
+from inspiremusic.utils.common import init_weights
+"""hifigan based generator implementation.
+This code is modified from https://github.com/jik876/hifi-gan
+ ,https://github.com/kan-bayashi/ParallelWaveGAN and
+ https://github.com/NVIDIA/BigVGAN
+"""
+class ResBlock(torch.nn.Module):
+    """Residual block module in HiFiGAN/BigVGAN."""
+    def __init__(
+        self,
+        channels: int = 512,
+        kernel_size: int = 3,
+        dilations: List[int] = [1, 3, 5],
+    ):
+        super(ResBlock, self).__init__()
+        self.convs1 = nn.ModuleList()
+        self.convs2 = nn.ModuleList()
+        for dilation in dilations:
+            self.convs1.append(
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=dilation,
+                        padding=get_padding(kernel_size, dilation)
+                    )
+                )
+            )
+            self.convs2.append(
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1)
+                    )
+                )
+            )
+        self.convs1.apply(init_weights)
+        self.convs2.apply(init_weights)
+        self.activations1 = nn.ModuleList([
+            Snake(channels, alpha_logscale=False)
+            for _ in range(len(self.convs1))
+        ])
+        self.activations2 = nn.ModuleList([
+            Snake(channels, alpha_logscale=False)
+            for _ in range(len(self.convs2))
+        ])
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        for idx in range(len(self.convs1)):
+            xt = self.activations1[idx](x)
+            xt = self.convs1[idx](xt)
+            xt = self.activations2[idx](xt)
+            xt = self.convs2[idx](xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        for idx in range(len(self.convs1)):
+            remove_weight_norm(self.convs1[idx])
+            remove_weight_norm(self.convs2[idx])
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0,
+            flag_for_pulse=False)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of sine-wavefrom (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_thoreshold: F0 threshold for U/V classification (default 0)
+    flag_for_pulse: this SinGen is used inside PulseGen (default False)
+    Note: when flag_for_pulse is True, the first time step of a voiced
+        segment is always sin(np.pi) or cos(0)
+    """
+    def __init__(self, samp_rate, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0):
+        super(SineGen, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.harmonic_num = harmonic_num
+        self.sampling_rate = samp_rate
+        self.voiced_threshold = voiced_threshold
+    def _f02uv(self, f0):
+        # generate uv signal
+        uv = (f0 > self.voiced_threshold).type(torch.float32)
+        return uv
+    @torch.no_grad()
+    def forward(self, f0):
+        """
+        :param f0: [B, 1, sample_len], Hz
+        :return: [B, 1, sample_len]
+        """
+        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
+        for i in range(self.harmonic_num + 1):
+            F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
+        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
+        u_dist = Uniform(low=-np.pi, high=np.pi)
+        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
+        phase_vec[:, 0, :] = 0
+        # generate sine waveforms
+        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
+        # generate uv signal
+        uv = self._f02uv(f0)
+        # noise: for unvoiced should be similar to sine_amp
+        #        std = self.sine_amp/3 -> max value ~ self.sine_amp
+        # .       for voiced regions is self.noise_std
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+        # first: set the unvoiced part to 0 by uv
+        # then: additive noise
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+class SourceModuleHnNSF(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    harmonic_num: number of harmonic above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that amplitude of noise in unvoiced is decided
+        by sine_amp
+    voiced_threshold: threhold to set U/V given F0 (default: 0)
+    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length 1)
+    uv (batchsize, length, 1)
+    """
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF, self).__init__()
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+        # to produce sine waveforms
+        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
+                                 sine_amp, add_noise_std, voiced_threshod)
+        # to merge source harmonics into a single excitation
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+    def forward(self, x):
+        """
+        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length 1)
+        """
+        # source for harmonic branch
+        with torch.no_grad():
+            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
+            sine_wavs = sine_wavs.transpose(1, 2)
+            uv = uv.transpose(1, 2)
+        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
+        # source for noise branch, in the same shape as uv
+        noise = torch.randn_like(uv) * self.sine_amp / 3
+        return sine_merge, noise, uv
+class HiFTGenerator(nn.Module):
+    """
+    HiFTNet Generator: Neural Source Filter + ISTFTNet
+    https://arxiv.org/abs/2309.09493
+    """
+    def __init__(
+            self,
+            in_channels: int = 80,
+            base_channels: int = 512,
+            nb_harmonics: int = 8,
+            sampling_rate: int = 22050,
+            nsf_alpha: float = 0.1,
+            nsf_sigma: float = 0.003,
+            nsf_voiced_threshold: float = 10,
+            upsample_rates: List[int] = [8, 8],
+            upsample_kernel_sizes: List[int] = [16, 16],
+            istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
+            resblock_kernel_sizes: List[int] = [3, 7, 11],
+            resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+            source_resblock_kernel_sizes: List[int] = [7, 11],
+            source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
+            lrelu_slope: float = 0.1,
+            audio_limit: float = 0.99,
+            f0_predictor: torch.nn.Module = None,
+    ):
+        super(HiFTGenerator, self).__init__()
+        self.out_channels = 1
+        self.nb_harmonics = nb_harmonics
+        self.sampling_rate = sampling_rate
+        self.istft_params = istft_params
+        self.lrelu_slope = lrelu_slope
+        self.audio_limit = audio_limit
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.m_source = SourceModuleHnNSF(
+            sampling_rate=sampling_rate,
+            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
+            harmonic_num=nb_harmonics,
+            sine_amp=nsf_alpha,
+            add_noise_std=nsf_sigma,
+            voiced_threshod=nsf_voiced_threshold)
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
+        self.conv_pre = weight_norm(
+            Conv1d(in_channels, base_channels, 7, 1, padding=3)
+        )
+        # Up
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        base_channels // (2**i),
+                        base_channels // (2**(i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+        # Down
+        self.source_downs = nn.ModuleList()
+        self.source_resblocks = nn.ModuleList()
+        downsample_rates = [1] + upsample_rates[::-1][:-1]
+        downsample_cum_rates = np.cumprod(downsample_rates)
+        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
+            if u == 1:
+                self.source_downs.append(
+                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
+                )
+            else:
+                self.source_downs.append(
+                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
+                )
+            self.source_resblocks.append(
+                ResBlock(base_channels // (2 ** (i + 1)), k, d)
+            )
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = base_channels // (2**(i + 1))
+            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(ResBlock(ch, k, d))
+        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+        self.reflection_pad = nn.ReflectionPad1d((1, 0))
+        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
+        self.f0_predictor = f0_predictor
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+        self.m_source.remove_weight_norm()
+        for l in self.source_downs:
+            remove_weight_norm(l)
+        for l in self.source_resblocks:
+            l.remove_weight_norm()
+    def _stft(self, x):
+        spec = torch.stft(
+            x,
+            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
+            return_complex=True)
+        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
+        return spec[..., 0], spec[..., 1]
+    def _istft(self, magnitude, phase):
+        magnitude = torch.clip(magnitude, max=1e2)
+        real = magnitude * torch.cos(phase)
+        img = magnitude * torch.sin(phase)
+        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
+                                        self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
+        return inverse_transform
+    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
+        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, self.lrelu_slope)
+            x = self.ups[i](x)
+            if i == self.num_upsamples - 1:
+                x = self.reflection_pad(x)
+            # fusion
+            si = self.source_downs[i](s_stft)
+            si = self.source_resblocks[i](si)
+            x = x + si
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
+        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundancy
+        x = self._istft(magnitude, phase)
+        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
+        return x
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
+        # mel->f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0->source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # mel+source->speech
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, f0
+    @torch.inference_mode()
+    def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        # mel->f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0->source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # use cache_source to avoid glitch
+        if cache_source.shape[2] != 0:
+            s[:, :, :cache_source.shape[2]] = cache_source
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, s

inspiremusic/hifigan/hifigan.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from matcha.hifigan.models import feature_loss, generator_loss, discriminator_loss
+from inspiremusic.utils.losses import tpr_loss, mel_loss
+class HiFiGan(nn.Module):
+    def __init__(self, generator, discriminator, mel_spec_transform,
+                 multi_mel_spectral_recon_loss_weight=45, feat_match_loss_weight=2.0,
+                 tpr_loss_weight=1.0, tpr_loss_tau=0.04):
+        super(HiFiGan, self).__init__()
+        self.generator = generator
+        self.discriminator = discriminator
+        self.mel_spec_transform = mel_spec_transform
+        self.multi_mel_spectral_recon_loss_weight = multi_mel_spectral_recon_loss_weight
+        self.feat_match_loss_weight = feat_match_loss_weight
+        self.tpr_loss_weight = tpr_loss_weight
+        self.tpr_loss_tau = tpr_loss_tau
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        if batch['turn'] == 'generator':
+            return self.forward_generator(batch, device)
+        else:
+            return self.forward_discriminator(batch, device)
+    def forward_generator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        pitch_feat = batch['pitch_feat'].to(device)
+        # 1. calculate generator outputs
+        generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+        # 3. calculate generator losses, feature loss, mel loss, tpr losses [Optional]
+        loss_gen, _ = generator_loss(y_d_gs)
+        loss_fm = feature_loss(fmap_rs, fmap_gs)
+        loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss_f0 = F.l1_loss(generated_f0, pitch_feat)
+        loss = loss_gen + self.feat_match_loss_weight * loss_fm + \
+            self.multi_mel_spectral_recon_loss_weight * loss_mel + \
+            self.tpr_loss_weight * loss_tpr + loss_f0
+        return {'loss': loss, 'loss_gen': loss_gen, 'loss_fm': loss_fm, 'loss_mel': loss_mel, 'loss_tpr': loss_tpr, 'loss_f0': loss_f0}
+    def forward_discriminator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        # 1. calculate generator outputs
+        with torch.no_grad():
+            generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+        # 3. calculate discriminator losses, tpr losses [Optional]
+        loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss = loss_disc + self.tpr_loss_weight * loss_tpr
+        return {'loss': loss, 'loss_disc': loss_disc, 'loss_tpr': loss_tpr}

inspiremusic/llm/llm.py ADDED Viewed

	@@ -0,0 +1,409 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Callable, List, Generator
+import torch
+from torch import nn
+from torch.nn.utils.rnn import pad_sequence, unpad_sequence
+from inspiremusic.utils.common import IGNORE_ID
+from inspiremusic.transformer.label_smoothing_loss import LabelSmoothingLoss
+from inspiremusic.utils.common import th_accuracy
+from torch import Tensor
+from math import log
+from einops import rearrange, reduce, repeat
+import logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+class SinusoidalEmbedding(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+    def forward(self, x: Tensor) -> Tensor:
+        device, half_dim = x.device, self.dim // 2
+        emb = torch.tensor(log(10000) / (half_dim - 1), device=device)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = rearrange(x, "i -> i 1") * rearrange(emb, "j -> 1 j")
+        return torch.cat((emb.sin(), emb.cos()), dim=-1).to(torch.float16)
+class LLM(torch.nn.Module):
+    def __init__(
+            self,
+            text_encoder_input_size: int,
+            llm_input_size: int,
+            llm_output_size: int,
+            audio_token_size: int,
+            llm: torch.nn.Module,
+            sampling: Callable,
+            text_encoder_conf: Dict = None,
+            length_normalized_loss: bool = True,
+            lsm_weight: float = 0.0,
+            frozen_input_embed: bool = False,
+            **kwargs,
+    ):
+        super().__init__()
+        self.llm_input_size = llm_input_size
+        self.audio_token_size = audio_token_size
+        # 1. build text token inputs related modules
+        if llm is None:
+            self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
+        else:
+            self.text_embedding = llm.model.model.embed_tokens
+            if frozen_input_embed:
+                print("Freezing input embedding layer")
+                for p in self.text_embedding.parameters():
+                    p.requires_grad = False
+        self.chorus_embedding = torch.nn.Embedding(5, llm_input_size)  # intro, chorus, verse1, verse2 , outro
+        self.text_encoder_conf = text_encoder_conf
+        self.text_encoder = self.build_encoder(text_encoder_conf)
+        self.infer_cfg_ratio = kwargs.get("infer_cfg_ratio", None)
+        logging.info(f"infer_cfg_ratio: {self.infer_cfg_ratio}")
+        self.train_cfg_ratio = kwargs.get("train_cfg_ratio", None)
+        logging.info(f"train_cfg_ratio: {self.train_cfg_ratio}")
+        # 2. build audio token language model related modules
+        self.sos_eos = 0
+        self.task_id = 1
+        self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
+        self.llm = llm
+        self.llm_decoder = nn.Linear(llm_output_size, audio_token_size + 1)
+        self.criterion_ce = LabelSmoothingLoss(
+                size=audio_token_size + 1,
+                padding_idx=IGNORE_ID,
+                smoothing=lsm_weight,
+                normalize_length=length_normalized_loss,
+        )
+        # 3. [Optional] build audio token related modules
+        self.speech_embedding = torch.nn.Embedding(audio_token_size, llm_input_size)
+        self.spk_embed_affine_layer = torch.nn.Linear(192, llm_input_size)
+        self.num_codebooks = 4
+        # 4. sampling method
+        self.sampling = sampling
+        self.time_embedding = SinusoidalEmbedding(llm_input_size)
+    def cfg_dropout(self, text_token, text_token_len, p):
+        # Classifier-Free Guidance Dropout
+        B = text_token.size(0)
+        num_samples_to_mask = int(p * B)
+        if num_samples_to_mask == 0:
+            num_samples_to_mask = 1
+        indices_to_mask = torch.randperm(B, device=text_token.device)[:num_samples_to_mask]
+        text_token[indices_to_mask] = 0
+        text_token_len[indices_to_mask] = 0
+        return text_token, text_token_len
+    def build_encoder(self, encoder_conf=None):
+        if encoder_conf is None:
+            assert hasattr(self, "encoder_conf"), \
+                "function param encoder_conf is None and model doesn't has encoder_conf attribute either."
+            encoder_conf = self.encoder_conf
+        encoder_name = encoder_conf.pop("name", "transformer")
+        model = None
+        if encoder_name == "transformer":
+            from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
+            model = ConformerEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+                    use_cnn_module=False,
+                    macaron_style=False,
+            )
+        elif encoder_name == "conformer":
+            from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
+            model = ConformerEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+        elif encoder_name == "llama_encoder":
+            from inspiremusic.transformer.encoder.llama_encoder import LlamaEncoder
+            model = LlamaEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+        elif encoder_name == "qwen2":
+            from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
+            model = QwenEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+        elif encoder_name == "qwen2.5":
+            from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
+            model = QwenEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+        encoder_conf["name"] = encoder_name
+        return model
+    def encode(self,
+            text: torch.Tensor,
+            text_lengths: torch.Tensor):
+        if self.text_encoder is not None:
+            encoder_out, encoder_mask = self.text_encoder(text, text_lengths,
+                                                          decoding_chunk_size=1,
+                                                          num_decoding_left_chunks=-1)
+            encoder_out_lens = encoder_mask.squeeze(1).sum(1)
+            encoder_out = self.text_encoder_affine_layer(encoder_out)
+        else:
+            encoder_out, encoder_out_lens = text, text_lengths
+        return encoder_out, encoder_out_lens
+    def pad_unpad_sequence(self, sos_eos_emb, embeddings, text_token,
+                           text_token_len, task_id_emb, audio_token,
+                           audio_token_len, seg_len):
+        text_token = unpad_sequence(text_token, text_token_len.cpu(),
+                                    batch_first=True)
+        audio_token = unpad_sequence(audio_token, audio_token_len.cpu(),
+                                     batch_first=True)
+        for i in range(len(embeddings)):
+            embeddings[i] = unpad_sequence(embeddings[i], seg_len.cpu(), batch_first=True)
+        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0)] + [embedding[i] for embedding in embeddings] + [text_token[i], task_id_emb.squeeze(dim=0), audio_token[i]], dim=0) for i in range(len(text_token))]
+        lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
+        lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
+        return lm_input, lm_input_len
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        """
+        Args:
+            text: (B, L, D)
+            text_lengths: (B,)
+            audio: (B, T, N) or (B, T)
+            audio_lengths: (B,)
+        """
+        mask = True
+        text_token = batch['text_token'].to(device)
+        text_token_len = batch['text_token_len'].to(device)
+        if "semantic_token" not in batch:
+            audio_token = batch['acoustic_token'].to(device)
+            audio_token_len = batch['acoustic_token_len'].to(device)
+            audio_token = audio_token.view(audio_token.size(0), -1, self.num_codebooks)
+            audio_token = audio_token[:, :, 0]
+            audio_token_len = (audio_token_len / self.num_codebooks).long()
+        else:
+            audio_token = batch['semantic_token'].to(device)
+            audio_token_len = batch['semantic_token_len'].to(device)
+        time_start = batch['time_start'].to(device)
+        time_end = batch['time_end'].to(device)
+        chorus = batch['chorus'].to(device)
+        # 1. encode text_token
+        if self.train_cfg_ratio > 0:
+            # Classifier-Free Guidance
+            text_token, _ = self.cfg_dropout(text_token, text_token_len, self.train_cfg_ratio)
+        # 2. Time Embedding & chorus embedding
+        text_token = self.text_embedding(text_token)
+        text_token, text_token_len = self.encode(text_token, text_token_len)
+        if mask:
+            time_mask = time_start != -1.0
+            seg_len = time_mask.sum(-1)
+            time_start = time_start.masked_fill(~time_mask, 0.0)
+            time_end = time_end.masked_fill(~time_mask, 0.0)
+            chorus = chorus.masked_fill(~time_mask, 0)
+            time_start_embed = self.time_embedding(time_start.view(-1)).to(text_token.dtype)
+            time_end_embed = self.time_embedding(time_end.view(-1)).to(text_token.dtype)
+            time_start_embed = time_start_embed.view(chorus.size(0), chorus.size(1), -1)
+            time_end_embed = time_end_embed.view(chorus.size(0), chorus.size(1), -1)
+            chorus_embed = self.chorus_embedding(chorus)
+            lm_target = [torch.tensor([IGNORE_ID] * (1 + 3 * seg_len[i] + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
+        else:
+            time_start_embed = self.time_embedding(time_start).to(text_token.dtype)
+            time_end_embed = self.time_embedding(time_end).to(text_token.dtype)
+            chorus_embed = self.chorus_embedding(chorus)
+            lm_target = [torch.tensor(
+                [IGNORE_ID] * (4 + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
+        lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
+        # 3. eos and task_id
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+        # 4. encode audio_token
+        audio_token = self.speech_embedding(audio_token)
+        # 5. unpad and pad
+        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb,
+                                                         [time_start_embed,
+                                                          time_end_embed,
+                                                          chorus_embed],
+                                                         text_token,
+                                                         text_token_len,
+                                                         task_id_emb,
+                                                         audio_token,
+                                                         audio_token_len,
+                                                         seg_len)
+        # 6. run lm forward
+        lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
+        logits = self.llm_decoder(lm_output)
+        loss = self.criterion_ce(logits, lm_target)
+        acc = th_accuracy(logits.view(-1, self.audio_token_size + 1), lm_target, ignore_label=IGNORE_ID)
+        return {'loss': loss, 'acc': acc}
+    def sampling_ids(
+            self,
+            weighted_scores: torch.Tensor,
+            decoded_tokens: List,
+            ignore_eos: bool = True,
+    ):
+        top_ids = self.sampling(weighted_scores, decoded_tokens)
+        return top_ids
+    @torch.inference_mode()
+    def inference(
+            self,
+            text: torch.Tensor,
+            text_len: torch.Tensor,
+            audio_token: torch.Tensor,
+            audio_token_len: torch.Tensor,
+            prompt_text: torch.Tensor,
+            prompt_text_len: torch.Tensor,
+            prompt_audio_token: torch.Tensor,
+            prompt_audio_token_len: torch.Tensor,
+            embeddings: List,
+            duration_to_gen: float = 300,
+            task: str = "continuation",
+            token_rate: int = 75,
+            limit_audio_prompt_len: int = 5,
+    ) -> Generator[torch.Tensor, None, None]:
+        device = text.device
+        if text is not None:
+            text = torch.concat([prompt_text, text], dim=1)
+            text_len += prompt_text_len
+            infer_cfg = self.infer_cfg_ratio >= 0.0
+            if infer_cfg:
+                text_cfg = self.text_embedding(text.new_zeros(text.shape))
+            text = self.text_embedding(text)
+            # 1. encode text
+            text, text_len = self.encode(text, text_len)
+        # 2. encode embedding
+        if embeddings is not None:
+            time_start, time_end, chorus = embeddings
+            if len(chorus.shape) == 1:
+                time_start_embed = self.time_embedding(time_start).reshape(1, 1, -1)  # .half()
+                time_end_embed = self.time_embedding(time_end).reshape(1, 1, -1)  # .half()
+                chorus_embed = self.chorus_embedding(chorus).reshape(1, 1, -1)  # .half()
+            else:
+                time_start_embed = self.time_embedding(
+                    time_start.view(-1)).reshape(1, chorus.size(1), -1)  # .half()
+                time_end_embed = self.time_embedding(time_end.view(-1)).reshape(1, chorus.size(1), -1)  # .half()
+                chorus_embed = self.chorus_embedding(chorus)  # .half()
+        # 3. concat llm_input
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+        if audio_token_len:
+            audio_token = audio_token[:, :(limit_audio_prompt_len * token_rate)]
+            audio_token_emb = self.speech_embedding(audio_token)
+        else:
+            audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+        if prompt_audio_token_len:
+            prompt_audio_token_emb = self.speech_embedding(prompt_audio_token)
+        else:
+            prompt_audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+        # Check if removing prompt audio token will fail decoding.
+        if task == "continuation":
+            lm_input = torch.concat(
+                    [sos_eos_emb, time_start_embed, time_end_embed,
+                     chorus_embed, text, task_id_emb, audio_token_emb], dim=1)
+            if infer_cfg:
+                audio_cfg = self.speech_embedding(
+                    audio_token.new_zeros(audio_token.shape))
+                lm_cf_input = torch.concat(
+                        [sos_eos_emb, torch.rand_like(time_start_embed),
+                         torch.rand_like(time_end_embed),
+                         torch.rand_like(chorus_embed), text_cfg, task_id_emb,
+                         audio_cfg], dim=1)
+                lm_input = torch.cat([lm_input, lm_cf_input], 0)
+        else:
+            lm_input = torch.concat(
+                    [sos_eos_emb, time_start_embed, time_end_embed,
+                     chorus_embed, text, task_id_emb], dim=1)
+            if infer_cfg:
+                lm_cf_input = torch.concat(
+                        [sos_eos_emb, torch.rand_like(time_start_embed),
+                         torch.rand_like(time_end_embed),
+                         torch.rand_like(chorus_embed), text_cfg, task_id_emb],
+                        dim=1)
+                lm_input = torch.cat([lm_input, lm_cf_input], 0)
+        # 4. cal min/max_length
+        min_len = 0.9 * duration_to_gen * token_rate
+        max_len = duration_to_gen * token_rate
+        logging.info(
+            f"LLM generation sequence length: {max_len}, generate audio length {duration_to_gen}s.")
+        # 5. step by step decode
+        out_tokens = []
+        offset = 0
+        state = None
+        for i in range(int(max_len)):
+            y_pred, _, state = self.llm.forward_one_step(lm_input, torch.ones(lm_input.shape[0], lm_input.shape[1], device=lm_input.device).to(torch.bool), cache=state)
+            logits = self.llm_decoder(y_pred[:, -1])
+            if infer_cfg:
+                # perform context free guidance
+                logits_cf = logits[1]
+                logits = logits[0]
+                infer_cfg_ratio = self.infer_cfg_ratio
+                logits = infer_cfg_ratio * logits + (1 - infer_cfg_ratio) * logits_cf
+            logp = logits.log_softmax(dim=-1)
+            logp = logp.squeeze(dim=0)
+            if i < int(min_len):
+                logp[self.audio_token_size] = torch.tensor(float('-inf'), dtype=torch.float16)
+            if i < int(min_len):
+                logp[self.audio_token_size] = torch.tensor(float('-inf'), dtype=torch.float16)
+            top_ids = self.sampling_ids(logp, out_tokens, ignore_eos=i < min_len).item()
+            if top_ids == self.audio_token_size:
+                break
+            # # in stream mode, yield token one by one
+            yield torch.tensor([[top_ids]], dtype=torch.int64, device=device)
+            out_tokens.append(top_ids)
+            offset += lm_input.size(1)
+            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
+            if infer_cfg:
+                lm_input = lm_input.repeat(2, 1, 1)

inspiremusic/metrics/clap_score.py ADDED Viewed

	@@ -0,0 +1,135 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import requests
+from tqdm import tqdm
+import torch
+import numpy as np
+import laion_clap
+from clap_module.factory import load_state_dict
+import librosa
+import pyloudnorm as pyln
+# following documentation from https://github.com/LAION-AI/CLAP
+def int16_to_float32(x):
+    return (x / 32767.0).astype(np.float32)
+def float32_to_int16(x):
+    x = np.clip(x, a_min=-1., a_max=1.)
+    return (x * 32767.).astype(np.int16)
+def clap_score(id2text, audio_path, audio_files_extension='.wav', clap_model='music_audioset_epoch_15_esc_90.14.pt'):
+    """
+    Cosine similarity is computed between the LAION-CLAP text embedding of the given prompt and
+    the LAION-CLAP audio embedding of the generated audio. LION-CLAP: https://github.com/LAION-AI/CLAP
+    This evaluation script assumes that audio_path files are identified with the ids in id2text.
+    clap_score() evaluates all ids in id2text.
+    GPU-based computation.
+    Select one of the following models from https://github.com/LAION-AI/CLAP:
+        - music_speech_audioset_epoch_15_esc_89.98.pt (used by musicgen)
+        - music_audioset_epoch_15_esc_90.14.pt
+        - music_speech_epoch_15_esc_89.25.pt
+        - 630k-audioset-fusion-best.pt (our default, with "fusion" to handle longer inputs)
+    Params:
+    -- id2text: dictionary with the mapping between id (generated audio filenames in audio_path)
+                and text (prompt used to generate audio). clap_score() evaluates all ids in id2text.
+    -- audio_path: path where the generated audio files to evaluate are available.
+    -- audio_files_extension: files extension (default .wav) in eval_path.
+    -- clap_model: choose one of the above clap_models (default: '630k-audioset-fusion-best.pt').
+    Returns:
+    -- CLAP-LION score
+    """
+    # load model
+    if clap_model == 'music_speech_audioset_epoch_15_esc_89.98.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_audioset_epoch_15_esc_89.98.pt'
+        clap_path = 'CLAP/music_speech_audioset_epoch_15_esc_89.98.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base',  device='cuda')
+    elif clap_model == 'music_audioset_epoch_15_esc_90.14.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_audioset_epoch_15_esc_90.14.pt'
+        clap_path = 'CLAP/music_audioset_epoch_15_esc_90.14.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base',  device='cuda')
+    elif clap_model == 'music_speech_epoch_15_esc_89.25.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_epoch_15_esc_89.25.pt'
+        clap_path = 'CLAP/music_speech_epoch_15_esc_89.25.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base',  device='cuda')
+    elif clap_model == '630k-audioset-fusion-best.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/630k-audioset-fusion-best.pt'
+        clap_path = 'CLAP/630k-audioset-fusion-best.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=True, device='cuda')
+    else:
+        raise ValueError('clap_model not implemented')
+    # download clap_model if not already downloaded
+    if not os.path.exists(clap_path):
+        print('Downloading ', clap_model, '...')
+        os.makedirs(os.path.dirname(clap_path), exist_ok=True)
+        response = requests.get(url, stream=True)
+        total_size = int(response.headers.get('content-length', 0))
+        with open(clap_path, 'wb') as file:
+            with tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
+                for data in response.iter_content(chunk_size=8192):
+                    file.write(data)
+                    progress_bar.update(len(data))
+    # fixing CLAP-LION issue, see: https://github.com/LAION-AI/CLAP/issues/118
+    pkg = load_state_dict(clap_path)
+    pkg.pop('text_branch.embeddings.position_ids', None)
+    model.model.load_state_dict(pkg)
+    model.eval()
+    if not os.path.isdir(audio_path):
+        raise ValueError(f'audio_path: {audio_path} does not exist')
+    if id2text:
+        print('[EXTRACTING TEXT EMBEDDINGS] ')
+        batch_size = 64
+        text_emb = {}
+        for i in tqdm(range(0, len(id2text), batch_size)):
+            batch_ids = list(id2text.keys())[i:i+batch_size]
+            batch_texts = [id2text[id] for id in batch_ids]
+            with torch.no_grad():
+                embeddings = model.get_text_embedding(batch_texts, use_tensor=True)
+            for id, emb in zip(batch_ids, embeddings):
+                text_emb[id] = emb
+    else:
+        raise ValueError('Must specify id2text')
+    print('[EVALUATING GENERATIONS] ', audio_path)
+    score = 0
+    count = 0
+    for id in tqdm(id2text.keys()):
+        file_path = os.path.join(audio_path, str(id)+audio_files_extension)
+        if os.path.isfile(file_path):
+            with torch.no_grad():
+                audio, _ = librosa.load(file_path, sr=48000, mono=True) # sample rate should be 48000
+                audio = pyln.normalize.peak(audio, -1.0)
+                audio = audio.reshape(1, -1) # unsqueeze (1,T)
+                audio = torch.from_numpy(int16_to_float32(float32_to_int16(audio))).float()
+                audio_embeddings = model.get_audio_embedding_from_data(x = audio, use_tensor=True)
+            cosine_sim = torch.nn.functional.cosine_similarity(audio_embeddings, text_emb[id].unsqueeze(0), dim=1, eps=1e-8)[0]
+            print(f"{id} | CLAP score = {cosine_sim}")
+            score += cosine_sim
+            count += 1
+    return score / count if count > 0 else 0

inspiremusic/metrics/openl3_fd.py ADDED Viewed

	@@ -0,0 +1,338 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import openl3
+import librosa
+import numpy as np
+from scipy import linalg
+import glob
+from tqdm import tqdm
+import os
+import soxr
+import pyloudnorm as pyln
+def calculate_embd_statistics(embd_lst):
+    if isinstance(embd_lst, list):
+        embd_lst = np.array(embd_lst)
+    mu = np.mean(embd_lst, axis=0)
+    sigma = np.cov(embd_lst, rowvar=False)
+    return mu, sigma
+def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
+    """
+    Adapted from: https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py
+    Adapted from: https://github.com/gudgud96/frechet-audio-distance/blob/main/frechet_audio_distance/fad.py
+    Numpy implementation of the Frechet Distance.
+    The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
+    and X_2 ~ N(mu_2, C_2) is
+            d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
+    Params:
+    -- mu1: Embedding's mean statistics for generated samples.
+    -- mu2: Embedding's mean statistics for reference samples.
+    -- sigma1: Covariance matrix over embeddings for generated samples.
+    -- sigma2: Covariance matrix over embeddings for reference samples.
+    Returns:
+    --  Fréchet Distance.
+    """
+    mu1 = np.atleast_1d(mu1)
+    mu2 = np.atleast_1d(mu2)
+    sigma1 = np.atleast_2d(sigma1)
+    sigma2 = np.atleast_2d(sigma2)
+    assert mu1.shape == mu2.shape, \
+        'Training and test mean vectors have different lengths'
+    assert sigma1.shape == sigma2.shape, \
+        'Training and test covariances have different dimensions'
+    diff = mu1 - mu2
+    # product might be almost singular
+    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+    if not np.isfinite(covmean).all():
+        msg = ('fid calculation produces singular product; '
+            'adding %s to diagonal of cov estimates') % eps
+        print(msg)
+        offset = np.eye(sigma1.shape[0]) * eps
+        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+    # numerical error might give slight imaginary component
+    if np.iscomplexobj(covmean):
+        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+            m = np.max(np.abs(covmean.imag))
+            raise ValueError('Imaginary component {}'.format(m))
+        covmean = covmean.real
+    tr_covmean = np.trace(covmean)
+    return (diff.dot(diff) + np.trace(sigma1)
+            + np.trace(sigma2) - 2 * tr_covmean)
+def extract_embeddings(directory_path, channels, samplingrate, content_type, openl3_hop_size, batch_size=16):
+    """
+    Given a list of files, compute their embeddings in batches.
+    If channels == 1: stereo audio is downmixed to mono. Mono embeddings are of dim=512.
+    If channels == 2: mono audio is "faked" to stereo by copying the mono channel.
+    Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
+    Params:
+    -- directory_path: path where the generated audio files are available.
+    -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
+    -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type specific openl3 model.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
+    -- batch_size: number of audio files to process in each batch.
+    Returns:
+    -- list of embeddings: [np.array[], ...], as expected by calculate_frechet_distance()
+    """
+    _, extension = os.path.splitext(directory_path)
+    if extension.lower() == ".scp":
+        wav_files = []
+        with open(directory_path, "r") as f:
+            for line in f:
+                sec = line.strip().split(" ")
+                wav_files.append(sec[1])
+    else:
+        wav_files = glob.glob(directory_path)
+    if len(wav_files) == 0:
+        raise ValueError('No files with this extension in this path!')
+    model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
+    first = True
+    for i in tqdm(range(0, len(wav_files), batch_size)):
+        batch_files = wav_files[i:i+batch_size]
+        batch_audio_l = []
+        batch_audio_r = []
+        batch_sr = []
+        for file in batch_files:
+            audio, sr = librosa.load(file, sr=None, mono=False)
+            audio = audio.T
+            audio = pyln.normalize.peak(audio, -1.0)
+            if audio.shape[0] < sr:
+                print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
+            # resample to the desired evaluation bandwidth
+            audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
+            # mono embeddings are stored in batch_audio_l (R channel not used)
+            if channels == 1:
+                batch_audio_l.append(audio)
+            elif channels == 2:
+                if audio.ndim == 1:
+                    # if mono, "fake" stereo by copying mono channel to L and R
+                    batch_audio_l.append(audio)
+                    batch_audio_r.append(audio)
+                elif audio.ndim == 2:
+                    # if it's stereo separate channels for openl3
+                    batch_audio_l.append(audio[:,0])
+                    batch_audio_r.append(audio[:,1])
+            batch_sr.append(samplingrate)
+        # extracting mono embeddings (dim=512) or the L channel for stereo embeddings
+        emb, _ = openl3.get_audio_embedding(batch_audio_l, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
+        # format mono embedding
+        if channels == 1:
+            emb = np.concatenate(emb,axis=0)
+        # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
+        elif channels == 2:
+            # extract the missing R channel
+            emb_r, _ = openl3.get_audio_embedding(batch_audio_r, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
+            emb = [np.concatenate([l, r], axis=1) for l, r in zip(emb, emb_r)]
+            emb = np.concatenate(emb, axis=0)
+        # concatenate embeddings
+        if first:
+            embeddings = emb
+            first = False
+        else:
+            embeddings = np.concatenate([embeddings, emb], axis=0)
+    # return as a list of embeddings: [np.array[], ...]
+    return [e for e in embeddings]
+def extract_embeddings_nobatching(directory_path, channels, samplingrate, content_type, openl3_hop_size):
+    """
+    Given a list of files, compute their embeddings one by one.
+    If channels == 1: stereo audio is downmixed to mono. Mono embeddings are of dim=512.
+    If channels == 2: mono audio is "faked" to stereo by copying the mono channel.
+    Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
+    Params:
+    -- directory_path: path where the generated audio files are available.
+    -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
+    -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type specific openl3 model.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
+    Returns:
+    -- list of embeddings: [np.array[], ...], as expected by calculate_frechet_distance()
+    """
+    _, extension = os.path.splitext(directory_path)
+    if extension.lower() == ".scp":
+        wav_files = []
+        with open(directory_path, "r") as f:
+            for line in f:
+                sec = line.strip().split(" ")
+                wav_files.append(sec[1])
+    else:
+        wav_files = glob.glob(directory_path)
+    if len(wav_files) == 0:
+        raise ValueError('No files with this extension in this path!')
+    model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
+    first = True
+    for file in tqdm(wav_files):
+        audio, sr = librosa.load(file, sr=None)
+        audio = pyln.normalize.peak(audio, -1.0)
+        if audio.shape[0] < sr:
+            print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
+        # resample to the desired evaluation bandwidth
+        audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
+        # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
+        if channels == 2:
+            if audio.ndim == 1:
+                audio_l3, sr_l3 = audio, samplingrate
+            elif audio.ndim == 2:
+                # if it's stereo separate channels for openl3
+                audio_l3 = [audio[:,0], audio[:,1]]
+                sr_l3 = [samplingrate, samplingrate]
+            emb, _ = openl3.get_audio_embedding(audio_l3, sr_l3, model=model, verbose=False, hop_size=openl3_hop_size)
+            if audio.ndim == 1:
+                # if mono audio, "fake" stereo by concatenating mono embedding as L and R embeddings
+                emb = np.concatenate([emb, emb],axis=1)
+            elif audio.ndim == 2:
+                emb = np.concatenate(emb,axis=1)
+        # or extracting mono embeddings (dim=512)
+        elif channels == 1:
+            emb, _ = openl3.get_audio_embedding(audio, samplingrate, model=model, verbose=False, hop_size=openl3_hop_size)
+        # concatenate embeddings
+        if first:
+            embeddings = emb
+            first = False
+        else:
+            embeddings = np.concatenate([embeddings, emb], axis=0)
+    # return as a list of embeddings: [np.array[], ...]
+    return [e for e in embeddings]
+def openl3_fd(channels, samplingrate, content_type, openl3_hop_size, eval_path,
+              eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_embeddings=None, batching=False):
+    """
+    Compute the Fréchet Distance between files in eval_path and ref_path.
+    Fréchet distance computed on top of openl3 embeddings.
+    GPU-based computation.
+    Extracting the embeddings is timeconsuming. After being computed once, we store them.
+    We store pre-computed reference embedding statistics in load/openl3_fd/
+    To load those and save computation, just set the path in load_ref_embeddings.
+    If load_ref_embeddings is set, ref_path is not required.
+    Params:
+    -- channels: 1 (mono), or 2 (stereo) to get the Fréchet Distance over mono or stereo embeddings.
+    -- samplingrate: max bandwith at wich we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type for openl3.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
+    -- eval_path: path where the generated audio files to evaluate are available.
+    -- eval_files_extenstion: files extension (default .wav) in eval_path.
+    -- ref_path: path where the reference audio files are available. (instead of load_ref_embeddings)
+    -- ref_files_extension: files extension (default .wav) in ref_path.
+    -- load_ref_embeddings: path to the reference embedding statistics. (inestead of ref_path)
+    -- batching: set batch size (with an int) or set to False (default False).
+    Returns:
+    -- Fréchet distance.
+    """
+    if not os.path.isdir(eval_path):
+        raise ValueError('eval_path does not exist')
+    if load_ref_embeddings:
+        if not os.path.exists(load_ref_embeddings):
+            raise ValueError('load_ref_embeddings does not exist')
+        print('[LOADING REFERENCE EMBEDDINGS] ', load_ref_embeddings)
+        loaded = np.load(load_ref_embeddings)
+        mu_ref = loaded['mu_ref']
+        sigma_ref = loaded['sigma_ref']
+    else:
+        if ref_path:
+            if not os.path.isdir(ref_path):
+                if not os.path.isfile(ref_path):
+                    raise ValueError("ref_path does not exist")
+            if os.path.isfile(ref_path):
+                path = ref_path
+            else:
+                path = os.path.join(ref_path, '*'+ref_files_extension)
+            print('[EXTRACTING REFERENCE EMBEDDINGS] ', path)
+            if batching:
+                ref_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
+            else:
+                ref_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)
+            mu_ref, sigma_ref = calculate_embd_statistics(ref_embeddings)
+            # store statistics to load later on
+            if not os.path.exists('load/openl3_fd'):
+                os.makedirs('load/openl3_fd/')
+            save_ref_embeddings_path = (
+                'load/openl3_fd/' +
+                path.replace('/', '_') +
+                '__channels' + str(channels) +
+                '__' + str(samplingrate) +
+                '__openl3' + str(content_type) +
+                '__openl3hopsize' + str(openl3_hop_size) +
+                '__batch' + str(batching) +
+                '.npz'
+            )
+            np.savez(save_ref_embeddings_path, mu_ref=mu_ref, sigma_ref=sigma_ref)
+            print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_embeddings_path)
+        else:
+            raise ValueError('Must specify ref_path or load_ref_embeddings')
+    path = os.path.join(eval_path, '*'+eval_files_extension)
+    print('[EXTRACTING EVALUATION EMBEDDINGS] ', path)
+    if batching:
+        eval_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
+    else:
+        eval_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)
+    mu_eval, sigma_eval = calculate_embd_statistics(eval_embeddings)
+    fd = calculate_frechet_distance(mu_eval, sigma_eval, mu_ref, sigma_ref)
+    if load_ref_embeddings:
+        print('[FRéCHET DISTANCE] ', eval_path, load_ref_embeddings, fd)
+    else:
+        print('[FRéCHET DISTANCE] ', eval_path, ref_path, fd)
+    return fd

inspiremusic/metrics/passt_kld.py ADDED Viewed

	@@ -0,0 +1,232 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+import os
+import contextlib
+from functools import partial
+from tqdm import tqdm
+import pickle
+import numpy as np
+import librosa
+from hear21passt.base import get_basic_model
+import pyloudnorm as pyln
+import torch
+import torch.nn.functional as F
+# Sampling rate (Hz) at which audio is loaded before being fed to PaSST;
+# window and step sizes given in seconds are converted to samples with it.
+SAMPLING_RATE = 32000
+class _patch_passt_stft:
+    """
+    From version 1.8.0, return_complex must always be given explicitly
+    for real inputs and return_complex=False has been deprecated.
+    Decorator to patch torch.stft in PaSST that uses an old stft version.
+    Adapted from: https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+    """
+    def __init__(self):
+        self.old_stft = torch.stft
+    def __enter__(self):
+        # return_complex is a mandatory parameter in latest torch versions.
+        # torch is throwing RuntimeErrors when not set.
+        # see: https://pytorch.org/docs/1.7.1/generated/torch.stft.html?highlight=stft#torch.stft
+        # see: https://github.com/kkoutini/passt_hear21/commit/dce83183674e559162b49924d666c0a916dc967a
+        torch.stft = partial(torch.stft, return_complex=False)
+    def __exit__(self, *exc):
+        torch.stft = self.old_stft
+def return_probabilities(model, audio_path, window_size=10, overlap=5, collect='mean'):
+    """
+    Given an audio and the PaSST model, return the probabilities of each AudioSet class.
+    Audio is converted to mono at 32kHz.
+    PaSST model is trained with 10 sec inputs. We refer to this parameter as the window_size.
+    We set it to 10 sec for consistency with PaSST training.
+    For longer audios, we split audio into overlapping analysis windows of window_size and overlap of 10 and 5 seconds.
+    PaSST supports 10, 20 or 30 sec inputs. Not longer inputs: https://github.com/kkoutini/PaSST/issues/19
+    Note that AudioSet taggers normally use sigmoid output layers. Yet, to compute the
+    KL we work with normalized probabilities by running a softmax over logits as in MusicGen:
+    https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+    This implementation assumes run will be on GPU.
+    Params:
+    -- model: PaSST model on a GPU.
+    -- audio_path: path to the audio to be loaded with librosa.
+    -- window_size (default=10 sec): analysis window (and receptive field) of PaSST.
+    -- overlap (default=5 sec): overlap of the running analysis window for inputs longar than window_size (10 sec).
+    -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along logits vector.
+    Returns:
+    --  527 probabilities (after softmax, no logarithm).
+    """
+    # load the audio using librosa
+    audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
+    audio = pyln.normalize.peak(audio, -1.0)
+    # calculate the step size for the analysis windows with the specified overlap
+    step_size = int((window_size - overlap) * SAMPLING_RATE)
+    # iterate over the audio, creating analysis windows
+    probabilities = []
+    for i in range(0, max(step_size, len(audio) - step_size), step_size):
+        # extract the current analysis window
+        window = audio[i:i + int(window_size * SAMPLING_RATE)]
+        # pad the window with zeros if it's shorter than the desired window size
+        if len(window) < int(window_size * SAMPLING_RATE):
+            # discard window if it's too small (avoid mostly zeros predicted as silence), as in MusicGen:
+            # https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+            if len(window) > int(window_size * SAMPLING_RATE * 0.15):
+                tmp = np.zeros(int(window_size * SAMPLING_RATE))
+                tmp[:len(window)] = window
+                window = tmp
+        # convert to a PyTorch tensor and move to GPU
+        audio_wave = torch.from_numpy(window.astype(np.float32)).unsqueeze(0).cuda()
+        # get the probabilities for this analysis window
+        with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f):
+            with torch.no_grad(), _patch_passt_stft():
+                logits = model(audio_wave)
+                probabilities.append(torch.squeeze(logits))
+    probabilities = torch.stack(probabilities)
+    if collect == 'mean':
+        probabilities = torch.mean(probabilities, dim=0)
+    elif collect == 'max':
+        probabilities, _ = torch.max(probabilities, dim=0)
+    return F.softmax(probabilities, dim=0).squeeze().cpu()
+def passt_kld(ids, eval_path, eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_probabilities=None, no_ids=[], collect='mean'):
+    """
+    Compute KL-divergence between the label probabilities of the generated audio with respect to the original audio.
+    Both generated audio (in eval_path) and original audio (in ref_path) are represented by the same prompt/description.
+    Audios are identified by an id, that is the name of the file in both directories and links the audio with the prompt/description.
+    segmenting the audio
+    For inputs longer that the 10 sec PaSST was trained on, we aggregate/collect via 'mean' (default) or 'max' pooling along the logits vector.
+    We split the inpot into overlapping analysis windows. Subsequently, we aggregate/collect (accross windows) the generated logits and then apply a softmax.
+    This evaluation script assumes that ids are in both ref_path and eval_path.
+    We label probabilities via the PaSST model: https://github.com/kkoutini/PaSST
+    GPU-based computation.
+    Extracting the probabilities is timeconsuming. After being computed once, we store them.
+    We store pre-computed reference probabilities in load/
+    To load those and save computation, just set the path in load_ref_probabilities.
+    If load_ref_probabilities is set, ref_path is not required.
+    Params:
+    -- ids: list of ids present in both eval_path and ref_path.
+    -- eval_path: path where the generated audio files to evaluate are available.
+    -- eval_files_extenstion: files extension (default .wav) in eval_path.
+    -- ref_path: path where the reference audio files are available. (instead of load_ref_probabilities)
+    -- ref_files_extenstion: files extension (default .wav) in ref_path.
+    -- load_ref_probabilities: path to the reference probabilities. (inestead of ref_path)
+    -- no_ids: it is possible that some reference audio is corrupted or not present. Ignore some this list of ids.
+    -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along the logits vector.
+    Returns:
+    -- KL divergence
+    """
+    with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f): # capturing all useless outputs from passt
+        # load model
+        model = get_basic_model(mode="logits")
+        model.eval()
+        model = model.cuda()
+    if not os.path.isdir(eval_path):
+        if not os.path.isfile(eval_path):
+            raise ValueError('eval_path does not exist')
+    if load_ref_probabilities:
+        if not os.path.exists(load_ref_probabilities):
+            raise ValueError('load_ref_probabilities does not exist')
+        print('[LOADING REFERENCE PROBABILITIES] ', load_ref_probabilities)
+        with open(load_ref_probabilities, 'rb') as fp:
+            ref_p = pickle.load(fp)
+    else:
+        if ref_path:
+            if not os.path.isdir(ref_path):
+                if os.path.isfile(ref_path):
+                    id2utt = {}
+                    with open(ref_path, "r") as f:
+                        for line in f:
+                            sec = line.strip().split(" ")
+                            id2utt[sec[0]] = sec[1]
+                    f.close()
+                else:
+                    raise ValueError("ref_path does not exist")
+            print('[EXTRACTING REFERENCE PROBABILITIES] ', ref_path)
+            ref_p = {}
+            for id in tqdm(ids):
+                if id not in no_ids:
+                    try:
+                        if os.path.isfile(ref_path):
+                            if id in id2utt.keys():
+                                audio_path = id2utt[id]
+                            else:
+                                raise ValueError(f"id: {id} not in {ref_path}!")
+                        else:
+                            audio_path = os.path.join(ref_path, str(id)+ref_files_extension)
+                        if os.path.isfile(audio_path):
+                            ref_p[id] = return_probabilities(model, audio_path, collect=collect)
+                    except Exception as e:
+                        print(f"An unexpected error occurred with {id}: {e}\nIf you failed to download it you can add it to no_ids list.")
+            # store reference probabilities to load later on
+            if not os.path.exists('load/passt_kld/'):
+                os.makedirs('load/passt_kld/')
+            save_ref_probabilities_path = 'load/passt_kld/'+ref_path.replace('/', '_')+'_collect'+str(collect)+'__reference_probabilities.pkl'
+            with open(save_ref_probabilities_path, 'wb') as fp:
+                pickle.dump(ref_p, fp)
+            print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_probabilities_path)
+        else:
+            raise ValueError('Must specify ref_path or load_ref_probabilities')
+    print('[EVALUATING GENERATIONS] ', eval_path)
+    passt_kl = 0
+    count = 0
+    for id in tqdm(ids):
+        if id not in no_ids:
+            try:
+                audio_path = os.path.join(eval_path, str(id)+eval_files_extension)
+                if os.path.isfile(audio_path):
+                    eval_p = return_probabilities(model, audio_path, collect=collect)
+                    # note: F.kl_div(x, y) is KL(y||x)
+                    # see: https://github.com/pytorch/pytorch/issues/7337
+                    # see: https://discuss.pytorch.org/t/kl-divergence-different-results-from-tf/56903/2
+                    passt_kl += F.kl_div((ref_p[id] + 1e-6).log(), eval_p, reduction='sum', log_target=False)
+                    count += 1
+            except Exception as e:
+                print(f"An unexpected error occurred with {id}: {e}\nIf you failed to download it you can add it to no_ids list.")
+    return passt_kl / count if count > 0 else 0

inspiremusic/music_tokenizer/__init__.py ADDED Viewed

File without changes

inspiremusic/music_tokenizer/env.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import shutil
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+def build_env(config, config_name, path):
+    t_path = os.path.join(path, config_name)
+    if config != t_path:
+        os.makedirs(path, exist_ok=True)
+        shutil.copyfile(config, os.path.join(path, config_name))

inspiremusic/music_tokenizer/meldataset.py ADDED Viewed

	@@ -0,0 +1,226 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# code based on https://github.com/b04901014/MQTTS
+import math
+import os
+import random
+import librosa
+import numpy as np
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+def load_wav(full_path, sr):
+    """Load ``full_path`` with librosa at rate ``sr``; return ``(wav, sr)``."""
+    samples, rate = librosa.load(full_path, sr=sr)
+    return samples, rate
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    """Return ``log(max(x, clip_val) * C)`` element-wise (NumPy)."""
+    floored = np.clip(x, a_min=clip_val, a_max=None)
+    return np.log(floored * C)
+def dynamic_range_decompression(x, C=1):
+    """Invert ``dynamic_range_compression``: return ``exp(x) / C`` (NumPy)."""
+    expanded = np.exp(x)
+    return expanded / C
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """Return ``log(clamp(x, min=clip_val) * C)`` element-wise (torch)."""
+    floored = torch.clamp(x, min=clip_val)
+    return torch.log(floored * C)
+def dynamic_range_decompression_torch(x, C=1):
+    """Invert ``dynamic_range_compression_torch``: return ``exp(x) / C``."""
+    expanded = torch.exp(x)
+    return expanded / C
+def spectral_normalize_torch(magnitudes):
+    """Log-compress ``magnitudes`` with the default compression settings."""
+    return dynamic_range_compression_torch(magnitudes)
+def spectral_de_normalize_torch(magnitudes):
+    """Undo ``spectral_normalize_torch`` with the default settings."""
+    return dynamic_range_decompression_torch(magnitudes)
+# Module-level caches shared by every call to mel_spectrogram().
+# mel_basis:   (sampling_rate, n_fft, num_mels, fmin, fmax, device) -> filterbank tensor
+# hann_window: (win_size, device) -> window tensor
+mel_basis = {}
+hann_window = {}
+## modified to call torch.stft with return_complex=True, as required by PyTorch 2.x
+def mel_spectrogram(y,
+                    n_fft,
+                    num_mels,
+                    sampling_rate,
+                    hop_size,
+                    win_size,
+                    fmin,
+                    fmax,
+                    center=False):
+    """Compute a log-compressed mel spectrogram of ``y``.
+
+    Args:
+        y: waveform tensor; it is unsqueezed at dim 1 for reflect padding and
+            squeezed back, so it is presumably (batch, samples) -- TODO confirm.
+        n_fft: FFT size passed to ``torch.stft``.
+        num_mels: number of mel bands of the filterbank.
+        sampling_rate: sample rate used to build the filterbank.
+        hop_size: STFT hop length.
+        win_size: STFT window length (Hann window).
+        fmin: lowest filterbank frequency.
+        fmax: highest filterbank frequency (``None`` is forwarded to librosa).
+        center: forwarded to ``torch.stft``.
+
+    Returns:
+        ``log(clamp(mel_basis @ |STFT(y)|, min=1e-5))``.
+    """
+    device_key = str(y.device)
+    # The original tested ``fmax not in mel_basis`` while storing under a
+    # composite key, so the cache never hit and the filterbank was rebuilt on
+    # every call. The keys now cover every input that shapes the cached
+    # tensors, so a hit is always valid.
+    mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
+    window_key = (win_size, device_key)
+    if mel_key not in mel_basis:
+        # Keyword arguments: librosa >= 0.10 made these parameters
+        # keyword-only, and older releases accept the same names.
+        mel = librosa_mel_fn(sr=sampling_rate,
+                             n_fft=n_fft,
+                             n_mels=num_mels,
+                             fmin=fmin,
+                             fmax=fmax)
+        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
+    if window_key not in hann_window:
+        hann_window[window_key] = torch.hann_window(win_size).to(y.device)
+    # Manual reflect padding so the (default) center=False STFT still covers
+    # the signal edges.
+    pad = int((n_fft - hop_size) / 2)
+    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
+    y = y.squeeze(1)
+    spec = torch.view_as_real(torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[window_key],
+        center=center,
+        pad_mode='reflect',
+        normalized=False,
+        onesided=True,
+        return_complex=True
+    ))
+    # Magnitude from (real, imag); the epsilon keeps sqrt differentiable at 0.
+    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+    spec = torch.matmul(mel_basis[mel_key], spec)
+    spec = spectral_normalize_torch(spec)
+    return spec
+def get_dataset_filelist(a):
+    """Read the training and validation file lists named on ``a``.
+
+    ``a.input_training_file`` and ``a.input_validation_file`` are text files
+    with one entry per line; each line is stripped of surrounding whitespace.
+    Returns ``(training_files, validation_files)``.
+    """
+    def _read_entries(list_path):
+        with open(list_path, 'r') as handle:
+            return [entry.strip() for entry in handle]
+    training_files = _read_entries(a.input_training_file)
+    validation_files = _read_entries(a.input_validation_file)
+    return training_files, validation_files
+class MelDataset(torch.utils.data.Dataset):
+    """Dataset yielding (mel, audio, filename, mel_loss) tuples per audio file.
+
+    In normal mode the mel spectrogram is computed from the (optionally
+    cropped or zero-padded) waveform. In fine-tuning mode it is loaded from
+    ``<base_mels_path>/<file stem>.npy`` and the waveform is cropped to match.
+    """
+    def __init__(self,
+                 training_files,
+                 segment_size,
+                 n_fft,
+                 num_mels,
+                 hop_size,
+                 win_size,
+                 sampling_rate,
+                 fmin,
+                 fmax,
+                 split=True,
+                 shuffle=True,
+                 n_cache_reuse=1,
+                 device=None,
+                 fmax_loss=None,
+                 fine_tuning=False,
+                 base_mels_path=None):
+        # NOTE: the list is stored by reference, so the shuffle below reorders
+        # the caller's list in place.
+        self.audio_files = training_files
+        # NOTE: this reseeds Python's global ``random`` generator.
+        random.seed(1234)
+        if shuffle:
+            random.shuffle(self.audio_files)
+        self.segment_size = segment_size
+        self.sampling_rate = sampling_rate
+        self.split = split
+        self.n_fft = n_fft
+        self.num_mels = num_mels
+        self.hop_size = hop_size
+        self.win_size = win_size
+        self.fmin = fmin
+        self.fmax = fmax
+        self.fmax_loss = fmax_loss
+        # Last loaded waveform and the number of further __getitem__ calls
+        # that will reuse it before loading again.
+        self.cached_wav = None
+        self.n_cache_reuse = n_cache_reuse
+        self._cache_ref_count = 0
+        self.device = device
+        self.fine_tuning = fine_tuning
+        self.base_mels_path = base_mels_path
+    def __getitem__(self, index):
+        """Return ``(mel, audio, filename, mel_loss)`` for ``index``."""
+        filename = self.audio_files[index]
+        if self._cache_ref_count == 0:
+            try:
+                # Note by yuantian: load with the sample_rate of config
+                audio, sampling_rate = load_wav(filename, sr=self.sampling_rate)
+            except Exception as e:
+                # Best effort: an unreadable file is replaced by 160000
+                # samples of Gaussian noise scaled by 0.05 so iteration
+                # continues. NOTE(review): the caught exception ``e`` is not
+                # reported.
+                print(f"Error on audio: {filename}")
+                audio = np.random.normal(size=(160000, )) * 0.05
+                sampling_rate = self.sampling_rate
+            self.cached_wav = audio
+            if sampling_rate != self.sampling_rate:
+                raise ValueError("{} SR doesn't match target {} SR".format(
+                    sampling_rate, self.sampling_rate))
+            self._cache_ref_count = self.n_cache_reuse
+        else:
+            # NOTE(review): the cached waveform is whatever was loaded last,
+            # regardless of ``index``; ``filename`` may therefore not match
+            # the returned audio while the counter is non-zero.
+            audio = self.cached_wav
+            self._cache_ref_count -= 1
+        audio = torch.FloatTensor(audio)
+        # Add a leading dimension of size 1 ahead of the sample axis.
+        audio = audio.unsqueeze(0)
+        if not self.fine_tuning:
+            if self.split:
+                if audio.size(1) >= self.segment_size:
+                    # Random crop of exactly segment_size samples.
+                    max_audio_start = audio.size(1) - self.segment_size
+                    audio_start = random.randint(0, max_audio_start)
+                    audio = audio[:, audio_start:audio_start +
+                                  self.segment_size]
+                else:
+                    # Too short: right-pad with zeros up to segment_size.
+                    audio = torch.nn.functional.pad(audio, (
+                        0, self.segment_size - audio.size(1)), 'constant')
+            mel = mel_spectrogram(
+                audio,
+                self.n_fft,
+                self.num_mels,
+                self.sampling_rate,
+                self.hop_size,
+                self.win_size,
+                self.fmin,
+                self.fmax,
+                center=False)
+        else:
+            # Fine-tuning: the mel is read from a .npy file named after the
+            # audio file's stem.
+            mel = np.load(
+                os.path.join(self.base_mels_path,
+                             os.path.splitext(os.path.split(filename)[-1])[0] +
+                             '.npy'))
+            mel = torch.from_numpy(mel)
+            if len(mel.shape) < 3:
+                mel = mel.unsqueeze(0)
+            if self.split:
+                frames_per_seg = math.ceil(self.segment_size / self.hop_size)
+                if audio.size(1) >= self.segment_size:
+                    # NOTE(review): the upper bound is negative when the mel
+                    # has fewer than frames_per_seg + 1 frames, in which case
+                    # random.randint raises ValueError.
+                    mel_start = random.randint(0,
+                                               mel.size(2) - frames_per_seg - 1)
+                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
+                    # Crop the waveform to the samples covered by the frames.
+                    audio = audio[:, mel_start * self.hop_size:(
+                        mel_start + frames_per_seg) * self.hop_size]
+                else:
+                    mel = torch.nn.functional.pad(mel, (
+                        0, frames_per_seg - mel.size(2)), 'constant')
+                    audio = torch.nn.functional.pad(audio, (
+                        0, self.segment_size - audio.size(1)), 'constant')
+        # Second mel of the same audio, computed with fmax_loss as the upper
+        # frequency instead of fmax.
+        mel_loss = mel_spectrogram(
+            audio,
+            self.n_fft,
+            self.num_mels,
+            self.sampling_rate,
+            self.hop_size,
+            self.win_size,
+            self.fmin,
+            self.fmax_loss,
+            center=False)
+        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
+    def __len__(self):
+        """Return the number of audio files."""
+        return len(self.audio_files)

inspiremusic/music_tokenizer/models.py ADDED Viewed

	@@ -0,0 +1,548 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import AvgPool1d
+from torch.nn import Conv1d
+from torch.nn import Conv2d
+from torch.nn import ConvTranspose1d
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils import spectral_norm
+from torch.nn.utils import weight_norm
+from inspiremusic.utils.tokenizer_utils import get_padding
+from inspiremusic.utils.tokenizer_utils import init_weights
+LRELU_SLOPE = 0.1
+class ResBlock1(torch.nn.Module):
+    """Residual block of three (dilated conv, undilated conv) pairs.
+
+    Every convolution is weight-normalised and keeps ``channels`` unchanged.
+    """
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.h = h
+        def _conv(rate):
+            # One weight-normalised, stride-1 convolution with dilation ``rate``.
+            return weight_norm(
+                Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate)))
+        # Only the first three dilation rates are used.
+        rates = (dilation[0], dilation[1], dilation[2])
+        self.convs1 = nn.ModuleList([_conv(rate) for rate in rates])
+        self.convs1.apply(init_weights)
+        # The second convolution of each pair is never dilated.
+        self.convs2 = nn.ModuleList([_conv(1) for _ in rates])
+        self.convs2.apply(init_weights)
+    def forward(self, x):
+        """Apply the three residual pairs in sequence and return the result."""
+        for dilated, plain in zip(self.convs1, self.convs2):
+            out = dilated(F.leaky_relu(x, LRELU_SLOPE))
+            out = plain(F.leaky_relu(out, LRELU_SLOPE))
+            x = out + x
+        return x
+    def remove_weight_norm(self):
+        """Strip weight normalisation from every convolution of the block."""
+        for layer in list(self.convs1) + list(self.convs2):
+            remove_weight_norm(layer)
+class ResBlock2(torch.nn.Module):
+    """Residual block of two dilated, weight-normalised 1-D convolutions."""
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.h = h
+        layers = []
+        # Only the first two dilation rates are used.
+        for rate in (dilation[0], dilation[1]):
+            conv = Conv1d(
+                channels,
+                channels,
+                kernel_size,
+                1,
+                dilation=rate,
+                padding=get_padding(kernel_size, rate))
+            layers.append(weight_norm(conv))
+        self.convs = nn.ModuleList(layers)
+        self.convs.apply(init_weights)
+    def forward(self, x):
+        """Apply each convolution with a residual connection around it."""
+        for layer in self.convs:
+            out = layer(F.leaky_relu(x, LRELU_SLOPE))
+            x = out + x
+        return x
+    def remove_weight_norm(self):
+        """Strip weight normalisation from every convolution of the block."""
+        for layer in self.convs:
+            remove_weight_norm(layer)
+class Generator(torch.nn.Module):
+    """Decoder mapping 512-channel features to a one-channel tanh output.
+
+    Each stage is a transposed-convolution upsampler followed by the average
+    of ``num_kernels`` residual blocks; channel width halves at every stage.
+    """
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        self.conv_pre = weight_norm(
+            Conv1d(512, h.upsample_initial_channel, 7, 1, padding=3))
+        block_cls = ResBlock1 if h.resblock == '1' else ResBlock2
+        self.ups = nn.ModuleList()
+        stages = zip(h.upsample_rates, h.upsample_kernel_sizes)
+        for stage, (rate, kernel) in enumerate(stages):
+            in_ch = h.upsample_initial_channel // (2**stage)
+            out_ch = h.upsample_initial_channel // (2**(stage + 1))
+            upsampler = ConvTranspose1d(
+                in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)
+            self.ups.append(weight_norm(upsampler))
+        self.resblocks = nn.ModuleList()
+        for stage in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2**(stage + 1))
+            for kernel, rates in zip(h.resblock_kernel_sizes,
+                                     h.resblock_dilation_sizes):
+                self.resblocks.append(block_cls(h, ch, kernel, rates))
+        # ``ch`` is the channel width of the last stage.
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+    def forward(self, x):
+        """Upsample ``x`` stage by stage and return the tanh-bounded output."""
+        x = self.conv_pre(x)
+        for stage in range(self.num_upsamples):
+            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
+            first = stage * self.num_kernels
+            xs = None
+            for block_idx in range(first, first + self.num_kernels):
+                out = self.resblocks[block_idx](x)
+                if xs is None:
+                    xs = out
+                else:
+                    xs += out
+            # Average the parallel residual blocks of this stage.
+            x = xs / self.num_kernels
+        x = self.conv_post(F.leaky_relu(x, LRELU_SLOPE))
+        return torch.tanh(x)
+    def remove_weight_norm(self):
+        """Strip weight normalisation from all layers of the generator."""
+        print('Removing weight norm...')
+        for upsampler in self.ups:
+            remove_weight_norm(upsampler)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+class DiscriminatorP(torch.nn.Module):
+    """Discriminator that views the signal as a 2-D grid of width ``period``."""
+    def __init__(self, period, kernel_size=5, stride=3,
+                 use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm
+        widths = (1, 32, 128, 512, 1024)
+        # NOTE: the padding is computed for a kernel of 5 whatever
+        # ``kernel_size`` is passed.
+        layers = [
+            norm_f(
+                Conv2d(
+                    c_in,
+                    c_out, (kernel_size, 1), (stride, 1),
+                    padding=(get_padding(5, 1), 0)))
+            for c_in, c_out in zip(widths[:-1], widths[1:])
+        ]
+        layers.append(
+            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))))
+        self.convs = nn.ModuleList(layers)
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+    def forward(self, x):
+        """Return ``(flattened scores, feature maps of every layer)``."""
+        fmap = []
+        # Fold the time axis into (t // period, period).
+        b, c, t = x.shape
+        remainder = t % self.period
+        if remainder != 0:
+            # Reflect-pad the tail so the length is a multiple of the period.
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+        for layer in self.convs:
+            x = F.leaky_relu(layer(x), LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        return torch.flatten(x, 1, -1), fmap
+class MultiPeriodDiscriminator(torch.nn.Module):
+    """Bank of period discriminators with periods 2, 3, 5, 7 and 11."""
+    def __init__(self):
+        super(MultiPeriodDiscriminator, self).__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)])
+    def forward(self, y, y_hat):
+        """Score real ``y`` and generated ``y_hat`` with every discriminator.
+
+        Returns four lists, one entry per discriminator: real scores,
+        generated scores, real feature maps, generated feature maps.
+        """
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
+        for disc in self.discriminators:
+            real_score, real_fmap = disc(y)
+            fake_score, fake_fmap = disc(y_hat)
+            y_d_rs.append(real_score)
+            fmap_rs.append(real_fmap)
+            y_d_gs.append(fake_score)
+            fmap_gs.append(fake_fmap)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class DiscriminatorS(torch.nn.Module):
+    """Stack of strided, grouped 1-D convolutions scoring a waveform."""
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm
+        # (in_channels, out_channels, kernel, stride, groups, padding)
+        specs = (
+            (1, 128, 15, 1, 1, 7),
+            (128, 128, 41, 2, 4, 20),
+            (128, 256, 41, 2, 16, 20),
+            (256, 512, 41, 4, 16, 20),
+            (512, 1024, 41, 4, 16, 20),
+            (1024, 1024, 41, 1, 16, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        )
+        self.convs = nn.ModuleList([
+            norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
+            for c_in, c_out, kernel, stride, groups, pad in specs
+        ])
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+    def forward(self, x):
+        """Return ``(flattened scores, feature maps of every layer)``."""
+        fmap = []
+        for layer in self.convs:
+            x = F.leaky_relu(layer(x), LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        return torch.flatten(x, 1, -1), fmap
+class MultiScaleDiscriminator(torch.nn.Module):
+    """Three scale discriminators fed progressively average-pooled audio."""
+    def __init__(self):
+        super(MultiScaleDiscriminator, self).__init__()
+        # Only the first discriminator uses spectral norm.
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorS(use_spectral_norm=True)] +
+            [DiscriminatorS() for _ in range(2)])
+        self.meanpools = nn.ModuleList(
+            [AvgPool1d(4, 2, padding=2) for _ in range(2)])
+    def forward(self, y, y_hat):
+        """Score real ``y`` and generated ``y_hat`` at every scale.
+
+        Returns four lists, one entry per discriminator: real scores,
+        generated scores, real feature maps, generated feature maps.
+        """
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
+        for scale, disc in enumerate(self.discriminators):
+            if scale > 0:
+                # Pool the already-pooled signals again for the next scale.
+                pool = self.meanpools[scale - 1]
+                y = pool(y)
+                y_hat = pool(y_hat)
+            real_score, real_fmap = disc(y)
+            fake_score, fake_fmap = disc(y_hat)
+            y_d_rs.append(real_score)
+            fmap_rs.append(real_fmap)
+            y_d_gs.append(fake_score)
+            fmap_gs.append(fake_fmap)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+def feature_loss(fmap_r, fmap_g):
+    """Return twice the summed mean absolute difference of paired feature maps.
+
+    ``fmap_r`` and ``fmap_g`` are parallel lists (one per discriminator) of
+    lists of tensors (one per layer). Returns ``0`` when there are no pairs.
+    """
+    total = sum(
+        torch.mean(torch.abs(real - fake))
+        for real_maps, fake_maps in zip(fmap_r, fmap_g)
+        for real, fake in zip(real_maps, fake_maps))
+    return total * 2
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    """Least-squares discriminator loss: real scores to 1, generated to 0.
+
+    Returns ``(total_loss, r_losses, g_losses)`` where the two lists hold the
+    per-discriminator terms as Python floats.
+    """
+    total = 0
+    r_losses, g_losses = [], []
+    for real, fake in zip(disc_real_outputs, disc_generated_outputs):
+        real_term = torch.mean((1 - real)**2)
+        fake_term = torch.mean(fake**2)
+        total += (real_term + fake_term)
+        r_losses.append(real_term.item())
+        g_losses.append(fake_term.item())
+    return total, r_losses, g_losses
+def generator_loss(disc_outputs):
+    """Least-squares generator loss pushing discriminator scores towards 1.
+
+    Returns ``(total_loss, gen_losses)`` where ``gen_losses`` holds the
+    per-discriminator loss tensors.
+    """
+    gen_losses = [torch.mean((1 - score)**2) for score in disc_outputs]
+    total = 0
+    for term in gen_losses:
+        total += term
+    return total, gen_losses
+class Encoder(torch.nn.Module):
+    """Strided-convolution encoder from a one-channel signal to 512 channels.
+
+    It mirrors the generator configuration in reverse order: each stage is a
+    strided convolution that doubles the channel width, followed by
+    group-normalised residual blocks.
+    """
+    def __init__(self, h):
+        super(Encoder, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        self.conv_pre = weight_norm(Conv1d(1, 32, 7, 1, padding=3))
+        self.normalize = nn.ModuleList()
+        block_cls = ResBlock1 if h.resblock == '1' else ResBlock2
+        self.ups = nn.ModuleList()
+        # Walk the (rate, kernel) pairs from last to first.
+        stages = list(zip(h.upsample_rates, h.upsample_kernel_sizes))[::-1]
+        for stage, (rate, kernel) in enumerate(stages):
+            down = Conv1d(
+                32 * (2**stage),
+                32 * (2**(stage + 1)),
+                kernel,
+                rate,
+                padding=((kernel - rate) // 2))
+            self.ups.append(weight_norm(down))
+        self.resblocks = nn.ModuleList()
+        for stage in range(len(self.ups)):
+            ch = 32 * (2**(stage + 1))
+            block_cfgs = zip(
+                reversed(h.resblock_kernel_sizes),
+                reversed(h.resblock_dilation_sizes))
+            for kernel, rates in block_cfgs:
+                self.resblocks.append(block_cls(h, ch, kernel, rates))
+                self.normalize.append(
+                    torch.nn.GroupNorm(ch // 16, ch, eps=1e-6, affine=True))
+        self.conv_post = Conv1d(512, 512, 3, 1, padding=1)
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+    def forward(self, x):
+        """Encode ``x`` and return the 512-channel features."""
+        x = self.conv_pre(x)
+        for stage in range(self.num_upsamples):
+            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
+            first = stage * self.num_kernels
+            xs = None
+            for block_idx in range(first, first + self.num_kernels):
+                out = self.resblocks[block_idx](x)
+                if xs is None:
+                    xs = out
+                else:
+                    xs += out
+                # The running sum is normalised after every block.
+                xs = self.normalize[block_idx](xs)
+            x = xs / self.num_kernels
+        # This activation uses leaky_relu's default slope, not LRELU_SLOPE.
+        x = F.leaky_relu(x)
+        return self.conv_post(x)
+    def remove_weight_norm(self):
+        """Strip weight normalisation from the weight-normalised layers."""
+        print('Removing weight norm...')
+        for down in self.ups:
+            remove_weight_norm(down)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+class Quantizer_module(torch.nn.Module):
+    """Single codebook of ``n_e`` vectors of size ``e_dim``."""
+    def __init__(self, n_e, e_dim):
+        super(Quantizer_module, self).__init__()
+        self.embedding = nn.Embedding(n_e, e_dim)
+        bound = 1.0 / n_e
+        self.embedding.weight.data.uniform_(-bound, bound)
+    def forward(self, x):
+        """Return ``(nearest code vectors, their indices)`` for rows of ``x``."""
+        codebook = self.embedding.weight
+        # Squared Euclidean distance expanded as |x|^2 + |e|^2 - 2 x.e
+        x_sq = torch.sum(x ** 2, 1, keepdim=True)
+        code_sq = torch.sum(codebook ** 2, 1)
+        cross = 2 * torch.matmul(x, codebook.T)
+        d = x_sq + code_sq - cross
+        min_indicies = torch.argmin(d, 1)
+        return self.embedding(min_indicies), min_indicies
+class Quantizer(torch.nn.Module):
+    """Two-layer residual vector quantizer over grouped 512-channel features.
+
+    The 512 channels are split into ``h.n_code_groups`` groups; each group
+    has its own codebook in each of the two residual layers.
+    """
+    def __init__(self, h):
+        super(Quantizer, self).__init__()
+        assert 512 % h.n_code_groups == 0
+        group_dim = 512 // h.n_code_groups
+        self.quantizer_modules = nn.ModuleList(
+            [Quantizer_module(h.n_codes, group_dim)
+             for _ in range(h.n_code_groups)])
+        self.quantizer_modules2 = nn.ModuleList(
+            [Quantizer_module(h.n_codes, group_dim)
+             for _ in range(h.n_code_groups)])
+        self.h = h
+        self.codebook_loss_lambda = self.h.codebook_loss_lambda  # e.g., 1
+        self.commitment_loss_lambda = self.h.commitment_loss_lambda  # e.g., 0.25
+        self.residul_layer = 2
+        self.n_code_groups = h.n_code_groups
+    def for_one_step(self, xin, idx):
+        """Quantize ``xin`` (B, C, T) with the codebooks of residual layer ``idx``.
+
+        Layer 0 uses ``quantizer_modules``; any other index uses
+        ``quantizer_modules2``. Returns ``(z_q, loss, min_indicies)`` where
+        ``z_q`` is (B, C, T) with a straight-through gradient and
+        ``min_indicies`` has one flat index tensor per code group.
+        """
+        codebooks = self.quantizer_modules if idx == 0 else self.quantizer_modules2
+        xin = xin.transpose(1, 2)
+        flat = xin.reshape(-1, 512)
+        groups = torch.split(flat, 512 // self.h.n_code_groups, dim=-1)
+        z_q = []
+        min_indicies = []
+        for chunk, codebook in zip(groups, codebooks):
+            chunk_q, chunk_idx = codebook(chunk)
+            z_q.append(chunk_q)
+            min_indicies.append(chunk_idx)  # B * T,
+        z_q = torch.cat(z_q, -1).reshape(xin.shape)
+        codebook_term = torch.mean((z_q - xin.detach()) ** 2)
+        commitment_term = torch.mean((z_q.detach() - xin) ** 2)
+        loss = self.codebook_loss_lambda * codebook_term \
+            + self.commitment_loss_lambda * commitment_term
+        # Straight-through estimator: forward uses z_q, gradient flows to xin.
+        z_q = xin + (z_q - xin).detach()
+        return z_q.transpose(1, 2), loss, min_indicies
+    def forward(self, xin):
+        """Quantize ``xin`` (B, C, T) through both residual layers.
+
+        Returns ``(quantized_out, loss, all_indices)``: the summed layer
+        outputs, the mean of the layer losses, and the concatenated
+        per-group index lists of both layers.
+        """
+        quantized_out = 0.0
+        residual = xin
+        layer_losses = []
+        all_indices = []
+        for layer in range(self.residul_layer):
+            quantized, layer_loss, indices = self.for_one_step(residual, layer)
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+            all_indices.extend(indices)
+            layer_losses.append(layer_loss)
+        loss = torch.mean(torch.stack(layer_losses))
+        return quantized_out, loss, all_indices
+    def embed(self, x):
+        """Look up and sum code vectors for index tensor ``x``.
+
+        ``x`` is (N, T, number of index groups); the first ``n_code_groups``
+        columns address layer 0 and the next ``n_code_groups`` address layer 1.
+        Returns a (N, C, T) tensor.
+        """
+        quantized_out = torch.tensor(0.0, device=x.device)
+        # Split the last dimension: each slice belongs to one index group.
+        columns = torch.split(x, 1, 2)
+        for layer in range(self.residul_layer):
+            if layer == 0:
+                codebooks, offset = self.quantizer_modules, 0
+            else:
+                codebooks, offset = self.quantizer_modules2, self.n_code_groups
+            parts = [
+                codebooks[j].embedding(columns[j + offset].squeeze(-1).long())
+                for j in range(self.n_code_groups)
+            ]
+            quantized_out = quantized_out + torch.cat(parts, -1)
+        return quantized_out.transpose(1, 2)  # N, C, T

inspiremusic/music_tokenizer/vqvae.py ADDED Viewed

	@@ -0,0 +1,58 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import torch
+import torch.nn as nn
+from inspiremusic.music_tokenizer.env import AttrDict
+from inspiremusic.music_tokenizer.models import Encoder
+from inspiremusic.music_tokenizer.models import Generator
+from inspiremusic.music_tokenizer.models import Quantizer
+class VQVAE(nn.Module):
+    """Music tokenizer wrapper: quantizer + generator, optionally an encoder.
+
+    Args:
+        config_path: path to the JSON hyper-parameter file.
+        ckpt_path: path to a checkpoint holding ``'generator'`` and
+            ``'quantizer'`` state dicts (and ``'encoder'`` when
+            ``with_encoder`` is true).
+        with_encoder: also build and load the encoder; required by
+            ``encode``.
+    """
+    def __init__(self,
+                 config_path,
+                 ckpt_path,
+                 with_encoder=False):
+        super(VQVAE, self).__init__()
+        # map_location='cpu' lets a checkpoint saved on a GPU be opened on a
+        # CPU-only host. load_state_dict copies the values into the module
+        # parameters, so the loaded weights are unchanged.
+        # NOTE(security): torch.load unpickles the file -- only load
+        # checkpoints from a trusted source.
+        ckpt = torch.load(ckpt_path, map_location='cpu')
+        with open(config_path) as f:
+            json_config = json.load(f)
+        self.h = AttrDict(json_config)
+        self.quantizer = Quantizer(self.h)
+        self.generator = Generator(self.h)
+        self.generator.load_state_dict(ckpt['generator'])
+        self.quantizer.load_state_dict(ckpt['quantizer'])
+        if with_encoder:
+            self.encoder = Encoder(self.h)
+            self.encoder.load_state_dict(ckpt['encoder'])
+    def forward(self, x):
+        """Decode code indices ``x`` of shape (B, T, Nq) with the generator."""
+        quant_emb = self.quantizer.embed(x)
+        return self.generator(quant_emb)
+    def encode(self, x):
+        """Encode ``x`` into code indices of shape [N, T, number of groups].
+
+        ``x`` is (N, samples) or (N, samples, 1). The model must have been
+        built with ``with_encoder=True``.
+        """
+        batch_size = x.size(0)
+        if len(x.shape) == 3 and x.shape[-1] == 1:
+            x = x.squeeze(-1)
+        c = self.encoder(x.unsqueeze(1))
+        # Only the index lists are needed; the quantized tensor and the loss
+        # are discarded.
+        _, _, c = self.quantizer(c)
+        c = [code.reshape(batch_size, -1) for code in c]
+        return torch.stack(c, -1)

inspiremusic/text/abs_tokenizer.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC
+from abc import abstractmethod
+from typing import Iterable
+from typing import List
+class AbsTokenizer(ABC):
+    """Abstract tokenizer interface: text to tokens and back."""
+    @abstractmethod
+    def text2tokens(self, line: str) -> List[str]:
+        """Convert ``line`` into a list of tokens."""
+        raise NotImplementedError
+    @abstractmethod
+    def tokens2text(self, tokens: Iterable[str]) -> str:
+        """Convert ``tokens`` back into text."""
+        raise NotImplementedError
+    def encode(self, line: str, **kwargs) -> List[str]:
+        """Alias of ``text2tokens``; extra keyword arguments are ignored."""
+        tokens = self.text2tokens(line)
+        return tokens

inspiremusic/text/tokenizer.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import os
+import re
+from typing import Iterable, List, Union
+import numpy as np
+import torch
+from inspiremusic.text.abs_tokenizer import AbsTokenizer
+from transformers import AutoTokenizer
+def get_tokenizer(tokenizer_name, tokenizer_path):
+    """Build the tokenizer selected by ``tokenizer_name``.
+
+    Args:
+        tokenizer_name: tokenizer identifier; any name containing "qwen"
+            selects the Qwen tokenizer. The match is case-insensitive, in
+            line with ``get_qwen_vocab_size``.
+        tokenizer_path: path or model id handed to the tokenizer.
+
+    Returns:
+        A ``QwenTokenizer`` for Qwen names, otherwise ``None``.
+    """
+    # Lower-casing first lets names such as "Qwen2.5" resolve instead of
+    # silently returning None.
+    if "qwen" in tokenizer_name.lower():
+        return QwenTokenizer(tokenizer_path, skip_special_tokens=True)
+    return None
+class QwenTokenizer(AbsTokenizer):
+    """Tokenizer backed by a Hugging Face ``AutoTokenizer`` plus extra special tokens."""
+    def __init__(
+            self,
+            token_path: str,
+            skip_special_tokens: bool = True,
+    ):
+        super().__init__()
+        # NOTE: non-chat model, all these special tokens keep randomly initialized.
+        extra_tokens = [
+            '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
+            '[breath]', '<strong>', '</strong>', '[noise]',
+            '[laughter]', '[cough]', '[clucking]', '[accent]',
+            '[quick_breath]',
+        ]
+        special_tokens = {
+            'eos_token': '<|endoftext|>',
+            'pad_token': '<|endoftext|>',
+            'additional_special_tokens': extra_tokens,
+        }
+        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
+        self.tokenizer.add_special_tokens(special_tokens)
+        self.skip_special_tokens = skip_special_tokens
+    def get_vocab_size(self):
+        """Return the wrapped tokenizer's ``vocab_size`` attribute."""
+        return self.tokenizer.vocab_size
+    def text2tokens(self, line: str) -> List:
+        """Tokenize ``line`` and return its input ids as a Python list."""
+        encoded = self.tokenizer([line], return_tensors="pt")
+        return encoded["input_ids"][0].cpu().tolist()
+    def tokens2text(self, tokens) -> str:
+        """Decode token ids into text, honouring ``skip_special_tokens``."""
+        ids = torch.tensor(tokens, dtype=torch.int64)
+        decoded = self.tokenizer.batch_decode(
+            [ids], skip_special_tokens=self.skip_special_tokens)
+        return decoded[0]
+def get_qwen_vocab_size(token_type: str):
+    """Return the vocabulary size used for supported Qwen tokenizers.
+    Args:
+        token_type: tokenizer name; matched case-insensitively against
+            "qwen1.5", "qwen2.0" and "qwen2.5".
+    Returns:
+        ``151643 + 293``.
+    Raises:
+        ValueError: if ``token_type`` names none of the supported families.
+    """
+    lowered = token_type.lower()
+    if not any(tag in lowered for tag in ("qwen1.5", "qwen2.0", "qwen2.5")):
+        raise ValueError(f"Unknown tokenizer {token_type}")
+    # 293 for special and extra tokens, including endoftext, im_start, im_end, endofprompt and others in the future.
+    # model.vocab_size = 151936, tokenizer.vocab_size = 151643
+    # NOTE: the first three special tokens (endoftext, im_start, im_end) are trained in Chat series models,
+    # others are kept in random initialization state.
+    return 151643 + 293

inspiremusic/transformer/__init__.py ADDED Viewed

File without changes

inspiremusic/transformer/activation.py ADDED Viewed

	@@ -0,0 +1,84 @@

+# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
+#               2020 Northwestern Polytechnical University (Pengcheng Guo)
+#               2020 Mobvoi Inc (Binbin Zhang)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Swish() activation function for Conformer."""
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
+class Swish(torch.nn.Module):
+    """Swish activation module computing ``x * sigmoid(x)``."""
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply Swish elementwise to ``x``."""
+        gate = torch.sigmoid(x)
+        return x * gate
+# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+#   LICENSE is in incl_licenses directory.
+class Snake(nn.Module):
+    '''
+    Sine-based periodic activation: x + 1/alpha * sin^2(alpha * x).
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - per-channel parameter (trainable unless disabled)
+    References:
+        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = Snake(256)
+        >>> x = torch.randn(4, 256, 100)
+        >>> x = a1(x)
+    '''
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: size of the alpha vector (one entry per channel)
+            - alpha: initial scale multiplied into the alpha vector;
+            higher values = higher-frequency.
+            - alpha_trainable: whether alpha receives gradients
+            - alpha_logscale: if True alpha is stored in log scale and
+            exponentiated in forward
+        '''
+        super().__init__()
+        self.in_features = in_features
+        self.alpha_logscale = alpha_logscale
+        # log scale alphas start from zeros, linear scale alphas from ones
+        base = torch.zeros(in_features) if alpha_logscale else torch.ones(in_features)
+        self.alpha = Parameter(base * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+    def forward(self, x):
+        '''
+        Apply the activation elementwise.
+        Snake ∶= x + 1/a * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
+        if self.alpha_logscale:
+            alpha = alpha.exp()
+        inv_alpha = 1.0 / (alpha + self.no_div_by_zero)
+        return x + inv_alpha * torch.sin(x * alpha).pow(2)

inspiremusic/transformer/attention.py ADDED Viewed

	@@ -0,0 +1,328 @@

+# Copyright (c) 2019 Shigeki Karita
+#               2020 Mobvoi Inc (Binbin Zhang)
+#               2022 Xingchen Song ([email protected])
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Multi-Head Attention layer definition."""
+import math
+from typing import Tuple
+import torch
+from torch import nn
+class MultiHeadedAttention(nn.Module):
+    """Multi-Head Attention layer.
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
+        key_bias (bool): Whether the key projection ``linear_k`` has a bias.
+    """
+    def __init__(self,
+                 n_head: int,
+                 n_feat: int,
+                 dropout_rate: float,
+                 key_bias: bool = True):
+        """Construct an MultiHeadedAttention object."""
+        super().__init__()
+        assert n_feat % n_head == 0
+        # We assume d_v always equals d_k
+        self.d_k = n_feat // n_head
+        self.h = n_head
+        self.linear_q = nn.Linear(n_feat, n_feat)
+        self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
+        self.linear_v = nn.Linear(n_feat, n_feat)
+        self.linear_out = nn.Linear(n_feat, n_feat)
+        self.dropout = nn.Dropout(p=dropout_rate)
+    def forward_qkv(
+        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Transform query, key and value.
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+        Returns:
+            torch.Tensor: Transformed query tensor, size
+                (#batch, n_head, time1, d_k).
+            torch.Tensor: Transformed key tensor, size
+                (#batch, n_head, time2, d_k).
+            torch.Tensor: Transformed value tensor, size
+                (#batch, n_head, time2, d_k).
+        """
+        n_batch = query.size(0)
+        # project, then split the feature dimension into (head, d_k)
+        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
+        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
+        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
+        q = q.transpose(1, 2)  # (batch, head, time1, d_k)
+        k = k.transpose(1, 2)  # (batch, head, time2, d_k)
+        v = v.transpose(1, 2)  # (batch, head, time2, d_k)
+        return q, k, v
+    def forward_attention(
+        self,
+        value: torch.Tensor,
+        scores: torch.Tensor,
+        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
+    ) -> torch.Tensor:
+        """Compute attention context vector.
+        Args:
+            value (torch.Tensor): Transformed value, size
+                (#batch, n_head, time2, d_k).
+            scores (torch.Tensor): Attention score, size
+                (#batch, n_head, time1, time2).
+            mask (torch.Tensor): Mask, size (#batch, 1, time2) or
+                (#batch, time1, time2), (0, 0, 0) means fake mask.
+        Returns:
+            torch.Tensor: Transformed value (#batch, time1, d_model)
+                weighted by the attention score (#batch, time1, time2).
+        """
+        n_batch = value.size(0)
+        # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
+        #   1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
+        #           1st chunk to ease the onnx export.]
+        #   2. pytorch training
+        if mask.size(2) > 0:  # time2 > 0
+            # after eq(0), True marks the positions that must be ignored
+            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
+            # For last chunk, time2 might be larger than scores.size(-1)
+            mask = mask[:, :, :, :scores.size(-1)]  # (batch, 1, *, time2)
+            scores = scores.masked_fill(mask, -float('inf'))
+            # masked positions are zeroed again after the softmax
+            attn = torch.softmax(scores, dim=-1).masked_fill(
+                mask, 0.0)  # (batch, head, time1, time2)
+        # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
+        #   1. onnx(16/-1, -1/-1, 16/0)
+        #   2. jit (16/-1, -1/-1, 16/0, 16/4)
+        else:
+            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+        p_attn = self.dropout(attn)
+        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
+        # merge the heads back into a single feature dimension
+        x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
+                                                 self.h * self.d_k)
+             )  # (batch, time1, d_model)
+        return self.linear_out(x)  # (batch, time1, d_model)
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
+        pos_emb: torch.Tensor = torch.empty(0),
+        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compute scaled dot product attention.
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+                (#batch, time1, time2).
+                1.When applying cross attention between decoder and encoder,
+                the batch padding mask for input is in (#batch, 1, T) shape.
+                2.When applying self attention of encoder,
+                the mask is in (#batch, T, T)  shape.
+                3.When applying self attention of decoder,
+                the mask is in (#batch, L, L)  shape.
+                4.If the different position in decoder see different block
+                of the encoder, such as Mocha, the passed in mask could be
+                in (#batch, L, T) shape. But there is no such case in current
+                InspireMusic.
+            pos_emb (torch.Tensor): Not read by this method.
+                NOTE(review): presumably kept so the signature matches
+                RelPositionMultiHeadedAttention.forward -- confirm.
+            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
+                where `cache_t == chunk_size * num_decoding_left_chunks`
+                and `head * d_k == size`
+        Returns:
+            torch.Tensor: Output tensor (#batch, time1, d_model).
+            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
+                where `cache_t == chunk_size * num_decoding_left_chunks`
+                and `head * d_k == size`
+        """
+        q, k, v = self.forward_qkv(query, key, value)
+        # NOTE(xcsong):
+        #   when export onnx model, for 1st chunk, we feed
+        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
+        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
+        #       In all modes, `if cache.size(0) > 0` will always be `True`
+        #       and we will always do splitting and
+        #       concatenation(this will simplify onnx export). Note that
+        #       it's OK to concat & split zero-shaped tensors(see code below).
+        #   when export jit  model, for 1st chunk, we always feed
+        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
+        # >>> a = torch.ones((1, 2, 0, 4))
+        # >>> b = torch.ones((1, 2, 3, 4))
+        # >>> c = torch.cat((a, b), dim=2)
+        # >>> torch.equal(b, c)        # True
+        # >>> d = torch.split(a, 2, dim=-1)
+        # >>> torch.equal(d[0], d[1])  # True
+        if cache.size(0) > 0:
+            # the cache stores keys and values side by side on the last dim
+            key_cache, value_cache = torch.split(cache,
+                                                 cache.size(-1) // 2,
+                                                 dim=-1)
+            k = torch.cat([key_cache, k], dim=2)
+            v = torch.cat([value_cache, v], dim=2)
+        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
+        #   non-trivial to calculate `next_cache_start` here.
+        new_cache = torch.cat((k, v), dim=-1)
+        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
+        return self.forward_attention(v, scores, mask), new_cache
+class RelPositionMultiHeadedAttention(MultiHeadedAttention):
+    """Multi-Head Attention layer with relative position encoding.
+    Paper: https://arxiv.org/abs/1901.02860
+    Args:
+        n_head (int): The number of heads.
+        n_feat (int): The number of features.
+        dropout_rate (float): Dropout rate.
+        key_bias (bool): Forwarded to the parent class constructor.
+    """
+    def __init__(self,
+                 n_head: int,
+                 n_feat: int,
+                 dropout_rate: float,
+                 key_bias: bool = True):
+        """Construct an RelPositionMultiHeadedAttention object."""
+        super().__init__(n_head, n_feat, dropout_rate, key_bias)
+        # linear transformation for positional encoding
+        self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
+        # these two learnable bias are used in matrix c and matrix d
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+        self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
+        self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
+        # torch.Tensor(...) above allocates uninitialized storage; fill it here
+        torch.nn.init.xavier_uniform_(self.pos_bias_u)
+        torch.nn.init.xavier_uniform_(self.pos_bias_v)
+    def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
+        """Compute relative positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
+            time1 means the length of query vector.
+        Returns:
+            torch.Tensor: Output tensor.
+        """
+        # prepend one zero column, reinterpret the last two dims, drop the
+        # first row and view back: this realizes the relative shift
+        zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
+                               device=x.device,
+                               dtype=x.dtype)
+        x_padded = torch.cat([zero_pad, x], dim=-1)
+        x_padded = x_padded.view(x.size()[0],
+                                 x.size()[1],
+                                 x.size(3) + 1, x.size(2))
+        x = x_padded[:, :, 1:].view_as(x)[
+            :, :, :, : x.size(-1) // 2 + 1
+        ]  # only keep the positions from 0 to time2
+        return x
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
+        pos_emb: torch.Tensor = torch.empty(0),
+        cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
+        Args:
+            query (torch.Tensor): Query tensor (#batch, time1, size).
+            key (torch.Tensor): Key tensor (#batch, time2, size).
+            value (torch.Tensor): Value tensor (#batch, time2, size).
+            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
+                (#batch, time1, time2), (0, 0, 0) means fake mask.
+            pos_emb (torch.Tensor): Positional embedding tensor
+                (#batch, time2, size).
+            cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
+                where `cache_t == chunk_size * num_decoding_left_chunks`
+                and `head * d_k == size`
+        Returns:
+            torch.Tensor: Output tensor (#batch, time1, d_model).
+            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
+                where `cache_t == chunk_size * num_decoding_left_chunks`
+                and `head * d_k == size`
+        """
+        q, k, v = self.forward_qkv(query, key, value)
+        # move heads next to d_k so the (head, d_k) position biases broadcast
+        q = q.transpose(1, 2)  # (batch, time1, head, d_k)
+        # NOTE(xcsong):
+        #   when export onnx model, for 1st chunk, we feed
+        #       cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
+        #       or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
+        #       In all modes, `if cache.size(0) > 0` will always be `True`
+        #       and we will always do splitting and
+        #       concatenation(this will simplify onnx export). Note that
+        #       it's OK to concat & split zero-shaped tensors(see code below).
+        #   when export jit  model, for 1st chunk, we always feed
+        #       cache(0, 0, 0, 0) since jit supports dynamic if-branch.
+        # >>> a = torch.ones((1, 2, 0, 4))
+        # >>> b = torch.ones((1, 2, 3, 4))
+        # >>> c = torch.cat((a, b), dim=2)
+        # >>> torch.equal(b, c)        # True
+        # >>> d = torch.split(a, 2, dim=-1)
+        # >>> torch.equal(d[0], d[1])  # True
+        if cache.size(0) > 0:
+            # the cache stores keys and values side by side on the last dim
+            key_cache, value_cache = torch.split(cache,
+                                                 cache.size(-1) // 2,
+                                                 dim=-1)
+            k = torch.cat([key_cache, k], dim=2)
+            v = torch.cat([value_cache, v], dim=2)
+        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
+        #   non-trivial to calculate `next_cache_start` here.
+        new_cache = torch.cat((k, v), dim=-1)
+        n_batch_pos = pos_emb.size(0)
+        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
+        p = p.transpose(1, 2)  # (batch, head, time1, d_k)
+        # (batch, head, time1, d_k)
+        q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
+        # (batch, head, time1, d_k)
+        q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
+        # compute attention score
+        # first compute matrix a and matrix c
+        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
+        # (batch, head, time1, time2)
+        matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
+        # compute matrix b and matrix d
+        # (batch, head, time1, time2)
+        matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
+        # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
+        # the shift is applied only when the two score matrices differ in shape
+        if matrix_ac.shape != matrix_bd.shape:
+            matrix_bd = self.rel_shift(matrix_bd)
+        scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k)  # (batch, head, time1, time2)
+        return self.forward_attention(v, scores, mask), new_cache

inspiremusic/transformer/convolution.py ADDED Viewed

	@@ -0,0 +1,145 @@

+# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from ESPnet(https://github.com/espnet/espnet)
+"""ConvolutionModule definition."""
+from typing import Tuple
+import torch
+from torch import nn
+class ConvolutionModule(nn.Module):
+    """ConvolutionModule in Conformer model.
+    Pipeline: pointwise conv (to 2 * channels) -> GLU -> depthwise conv ->
+    norm -> activation -> pointwise conv.
+    """
+    def __init__(self,
+                 channels: int,
+                 kernel_size: int = 15,
+                 activation: nn.Module = nn.ReLU(),
+                 norm: str = "batch_norm",
+                 causal: bool = False,
+                 bias: bool = True):
+        """Construct an ConvolutionModule object.
+        Args:
+            channels (int): The number of channels of conv layers.
+            kernel_size (int): Kernel size of the depthwise conv layer;
+                must be odd when ``causal`` is False.
+            activation (nn.Module): Activation applied after normalization.
+            norm (str): Either "batch_norm" or "layer_norm".
+            causal (bool): Whether use causal convolution or not
+            bias (bool): Whether the conv layers use a bias term.
+        """
+        super().__init__()
+        self.pointwise_conv1 = nn.Conv1d(
+            channels,
+            2 * channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        # self.lorder is used to distinguish if it's a causal convolution,
+        # if self.lorder > 0: it's a causal convolution, the input will be
+        #    padded with self.lorder frames on the left in forward.
+        # else: it's a symmetrical convolution
+        if causal:
+            padding = 0
+            self.lorder = kernel_size - 1
+        else:
+            # kernel_size should be an odd number for none causal convolution
+            assert (kernel_size - 1) % 2 == 0
+            padding = (kernel_size - 1) // 2
+            self.lorder = 0
+        self.depthwise_conv = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size,
+            stride=1,
+            padding=padding,
+            groups=channels,
+            bias=bias,
+        )
+        assert norm in ['batch_norm', 'layer_norm']
+        if norm == "batch_norm":
+            self.use_layer_norm = False
+            self.norm = nn.BatchNorm1d(channels)
+        else:
+            self.use_layer_norm = True
+            self.norm = nn.LayerNorm(channels)
+        self.pointwise_conv2 = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        self.activation = activation
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
+        cache: torch.Tensor = torch.zeros((0, 0, 0)),
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compute convolution module.
+        The input tensor is not modified.
+        Args:
+            x (torch.Tensor): Input tensor (#batch, time, channels).
+            mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
+                (0, 0, 0) means fake mask.
+            cache (torch.Tensor): left context cache, it is only
+                used in causal convolution (#batch, channels, cache_t),
+                (0, 0, 0) means fake cache.
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, channels).
+            torch.Tensor: New left-context cache holding the last
+                ``self.lorder`` frames (#batch, channels, lorder) for causal
+                convolution, otherwise a fake (0, 0, 0) tensor.
+        """
+        # exchange the temporal dimension and the feature dimension
+        x = x.transpose(1, 2)  # (#batch, channels, time)
+        # mask batch padding
+        if mask_pad.size(2) > 0:  # time > 0
+            # Out-of-place on purpose: x is still a view of the caller's
+            # tensor here, so masked_fill_ would overwrite the caller's input.
+            x = x.masked_fill(~mask_pad, 0.0)
+        if self.lorder > 0:
+            if cache.size(2) == 0:  # cache_t == 0
+                x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
+            else:
+                assert cache.size(0) == x.size(0)  # equal batch
+                assert cache.size(1) == x.size(1)  # equal channel
+                x = torch.cat((cache, x), dim=2)
+            assert (x.size(2) > self.lorder)
+            new_cache = x[:, :, -self.lorder:]
+        else:
+            # It's better we just return None if no cache is required,
+            # However, for JIT export, here we just fake one tensor instead of
+            # None.
+            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
+        # GLU mechanism
+        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
+        x = nn.functional.glu(x, dim=1)  # (batch, channel, dim)
+        # 1D Depthwise Conv
+        x = self.depthwise_conv(x)
+        # LayerNorm normalizes the last dim, so channels are moved last for it
+        if self.use_layer_norm:
+            x = x.transpose(1, 2)
+        x = self.activation(self.norm(x))
+        if self.use_layer_norm:
+            x = x.transpose(1, 2)
+        x = self.pointwise_conv2(x)
+        # mask batch padding
+        if mask_pad.size(2) > 0:  # time > 0
+            x.masked_fill_(~mask_pad, 0.0)
+        return x.transpose(1, 2), new_cache

inspiremusic/transformer/decoder.py ADDED Viewed

	@@ -0,0 +1,396 @@

+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from ESPnet(https://github.com/espnet/espnet)
+"""Decoder definition."""
+from typing import Tuple, List, Optional
+import torch
+import torch.utils.checkpoint as ckpt
+import logging
+from inspiremusic.transformer.decoder_layer import DecoderLayer
+from inspiremusic.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from inspiremusic.utils.class_utils import (
+    INSPIREMUSIC_EMB_CLASSES,
+    INSPIREMUSIC_ATTENTION_CLASSES,
+    INSPIREMUSIC_ACTIVATION_CLASSES,
+)
+from inspiremusic.utils.mask import (subsequent_mask, make_pad_mask)
+class TransformerDecoder(torch.nn.Module):
+    """Base class of Transfomer decoder module.
+    Args:
+        vocab_size: output dim
+        encoder_output_size: dimension of attention
+        attention_heads: the number of heads of multi head attention
+        linear_units: the hidden units number of position-wise feedforward
+        num_blocks: the number of decoder blocks
+        dropout_rate: dropout rate
+        self_attention_dropout_rate: dropout rate for attention
+        input_layer: input layer type
+        use_output_layer: whether to use output layer
+        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
+        normalize_before:
+            True: use layer_norm before each sub-block of a layer.
+            False: use layer_norm after each sub-block of a layer.
+        src_attention: if false, encoder-decoder cross attention is not
+                       applied, such as CIF model
+        key_bias: whether use bias in attention.linear_k, False for whisper models.
+        gradient_checkpointing: rerunning a forward-pass segment for each
+            checkpointed segment during backward.
+        tie_word_embedding: Tie or clone module weights depending of whether we are
+            using TorchScript or not
+    """
+    def __init__(
+        self,
+        vocab_size: int,
+        encoder_output_size: int,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        self_attention_dropout_rate: float = 0.0,
+        src_attention_dropout_rate: float = 0.0,
+        input_layer: str = "embed",
+        use_output_layer: bool = True,
+        normalize_before: bool = True,
+        src_attention: bool = True,
+        key_bias: bool = True,
+        activation_type: str = "relu",
+        gradient_checkpointing: bool = False,
+        tie_word_embedding: bool = False,
+    ):
+        """Build the embedding, decoder blocks, final norm and output layer."""
+        super().__init__()
+        # the decoder works at the encoder's output dimension
+        attention_dim = encoder_output_size
+        activation = INSPIREMUSIC_ACTIVATION_CLASSES[activation_type]()
+        # token embedding (replaced by Identity when input_layer == "no_pos")
+        # followed by the INSPIREMUSIC_EMB_CLASSES entry chosen by input_layer
+        self.embed = torch.nn.Sequential(
+            torch.nn.Identity() if input_layer == "no_pos" else
+            torch.nn.Embedding(vocab_size, attention_dim),
+            INSPIREMUSIC_EMB_CLASSES[input_layer](attention_dim,
+                                               positional_dropout_rate),
+        )
+        self.normalize_before = normalize_before
+        self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
+        self.use_output_layer = use_output_layer
+        if use_output_layer:
+            self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
+        else:
+            self.output_layer = torch.nn.Identity()
+        self.num_blocks = num_blocks
+        # each block gets a self-attention module, an optional source
+        # attention module (None when src_attention is False) and a
+        # position-wise feed-forward module
+        self.decoders = torch.nn.ModuleList([
+            DecoderLayer(
+                attention_dim,
+                INSPIREMUSIC_ATTENTION_CLASSES["selfattn"](
+                    attention_heads, attention_dim,
+                    self_attention_dropout_rate, key_bias),
+                INSPIREMUSIC_ATTENTION_CLASSES["selfattn"](
+                    attention_heads, attention_dim, src_attention_dropout_rate,
+                    key_bias) if src_attention else None,
+                PositionwiseFeedForward(attention_dim, linear_units,
+                                        dropout_rate, activation),
+                dropout_rate,
+                normalize_before,
+            ) for _ in range(self.num_blocks)
+        ])
+        self.gradient_checkpointing = gradient_checkpointing
+        self.tie_word_embedding = tie_word_embedding
+    def forward(
+        self,
+        memory: torch.Tensor,
+        memory_mask: torch.Tensor,
+        ys_in_pad: torch.Tensor,
+        ys_in_lens: torch.Tensor,
+        r_ys_in_pad: torch.Tensor = torch.empty(0),
+        reverse_weight: float = 0.0,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Forward decoder.
+        Args:
+            memory: encoded memory, float32  (batch, maxlen_in, feat)
+            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
+            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
+            ys_in_lens: input lengths of this batch (batch)
+            r_ys_in_pad: not used in transformer decoder, in order to unify api
+                with bidirectional decoder
+            reverse_weight: not used in transformer decoder, in order to unify
+                api with bidirectional decoder
+        Returns:
+            (tuple): tuple containing:
+                x: decoded token score before softmax (batch, maxlen_out,
+                    vocab_size) if use_output_layer is True,
+                torch.tensor(0.0), in order to unify api with bidirectional decoder
+                olens: (batch, )
+        NOTE(xcsong):
+            We pass the `__call__` method of the modules instead of `forward` to the
+            checkpointing API because `__call__` attaches all the hooks of the module.
+            https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
+        """
+        tgt = ys_in_pad
+        maxlen = tgt.size(1)
+        # tgt_mask: (B, 1, L)
+        tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
+        tgt_mask = tgt_mask.to(tgt.device)
+        # m: (1, L, L)
+        m = subsequent_mask(tgt_mask.size(-1),
+                            device=tgt_mask.device).unsqueeze(0)
+        # tgt_mask: (B, L, L)
+        tgt_mask = tgt_mask & m
+        # the embedding module returns a pair; only the first element is used
+        x, _ = self.embed(tgt)
+        # checkpointing is only applied while training
+        if self.gradient_checkpointing and self.training:
+            x = self.forward_layers_checkpointed(x, tgt_mask, memory,
+                                                 memory_mask)
+        else:
+            x = self.forward_layers(x, tgt_mask, memory, memory_mask)
+        if self.normalize_before:
+            x = self.after_norm(x)
+        if self.use_output_layer:
+            x = self.output_layer(x)
+        # NOTE(review): tgt_mask is annotated (B, L, L) above, so sum(1)
+        # would give (B, L) rather than the (batch, ) documented -- confirm.
+        olens = tgt_mask.sum(1)
+        return x, torch.tensor(0.0), olens
+    def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
+                       memory: torch.Tensor,
+                       memory_mask: torch.Tensor) -> torch.Tensor:
+        for layer in self.decoders:
+            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
+                                                     memory_mask)
+        return x
+    @torch.jit.unused
+    def forward_layers_checkpointed(self, x: torch.Tensor,
+                                    tgt_mask: torch.Tensor,
+                                    memory: torch.Tensor,
+                                    memory_mask: torch.Tensor) -> torch.Tensor:
+        for layer in self.decoders:
+            x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
+                layer.__call__, x, tgt_mask, memory, memory_mask)
+        return x
+    def forward_one_step(
+        self,
+        memory: torch.Tensor,
+        memory_mask: torch.Tensor,
+        tgt: torch.Tensor,
+        tgt_mask: torch.Tensor,
+        cache: Optional[List[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """Forward one step.
+            This is only used for decoding.
+        Args:
+            memory: encoded memory, float32  (batch, maxlen_in, feat)
+            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
+            tgt: input token ids, int64 (batch, maxlen_out)
+            tgt_mask: input token mask,  (batch, maxlen_out)
+                      dtype=torch.uint8 in PyTorch 1.2-
+                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
+            cache: cached output list of (batch, max_time_out-1, size)
+        Returns:
+            y, cache: NN output value and cache per `self.decoders`.
+            y.shape` is (batch, maxlen_out, token)
+        """
+        x, _ = self.embed(tgt)
+        new_cache = []
+        for i, decoder in enumerate(self.decoders):
+            if cache is None:
+                c = None
+            else:
+                c = cache[i]
+            x, tgt_mask, memory, memory_mask = decoder(x,
+                                                       tgt_mask,
+                                                       memory,
+                                                       memory_mask,
+                                                       cache=c)
+            new_cache.append(x)
+        if self.normalize_before:
+            y = self.after_norm(x[:, -1])
+        else:
+            y = x[:, -1]
+        if self.use_output_layer:
+            y = torch.log_softmax(self.output_layer(y), dim=-1)
+        return y, new_cache
+    def tie_or_clone_weights(self, jit_mode: bool = True):
+        """Tie or clone module weights (between word_emb and output_layer)
+            depending of whether we are using TorchScript or not"""
+        if not self.use_output_layer:
+            return
+        if jit_mode:
+            logging.info("clone emb.weight to output.weight")
+            self.output_layer.weight = torch.nn.Parameter(
+                self.embed[0].weight.clone())
+        else:
+            logging.info("tie emb.weight with output.weight")
+            self.output_layer.weight = self.embed[0].weight
+        if getattr(self.output_layer, "bias", None) is not None:
+            self.output_layer.bias.data = torch.nn.functional.pad(
+                self.output_layer.bias.data,
+                (
+                    0,
+                    self.output_layer.weight.shape[0] -
+                    self.output_layer.bias.shape[0],
+                ),
+                "constant",
+                0,
+            )
+class BiTransformerDecoder(torch.nn.Module):
+    """Base class of Transfomer decoder module.
+    Args:
+        vocab_size: output dim
+        encoder_output_size: dimension of attention
+        attention_heads: the number of heads of multi head attention
+        linear_units: the hidden units number of position-wise feedforward
+        num_blocks: the number of decoder blocks
+        r_num_blocks: the number of right to left decoder blocks
+        dropout_rate: dropout rate
+        self_attention_dropout_rate: dropout rate for attention
+        input_layer: input layer type
+        use_output_layer: whether to use output layer
+        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
+        normalize_before:
+            True: use layer_norm before each sub-block of a layer.
+            False: use layer_norm after each sub-block of a layer.
+        key_bias: whether use bias in attention.linear_k, False for whisper models.
+    """
+    def __init__(
+        self,
+        vocab_size: int,
+        encoder_output_size: int,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        r_num_blocks: int = 0,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        self_attention_dropout_rate: float = 0.0,
+        src_attention_dropout_rate: float = 0.0,
+        input_layer: str = "embed",
+        use_output_layer: bool = True,
+        normalize_before: bool = True,
+        key_bias: bool = True,
+        gradient_checkpointing: bool = False,
+        tie_word_embedding: bool = False,
+    ):
+        super().__init__()
+        self.tie_word_embedding = tie_word_embedding
+        self.left_decoder = TransformerDecoder(
+            vocab_size,
+            encoder_output_size,
+            attention_heads,
+            linear_units,
+            num_blocks,
+            dropout_rate,
+            positional_dropout_rate,
+            self_attention_dropout_rate,
+            src_attention_dropout_rate,
+            input_layer,
+            use_output_layer,
+            normalize_before,
+            key_bias=key_bias,
+            gradient_checkpointing=gradient_checkpointing,
+            tie_word_embedding=tie_word_embedding)
+        self.right_decoder = TransformerDecoder(
+            vocab_size,
+            encoder_output_size,
+            attention_heads,
+            linear_units,
+            r_num_blocks,
+            dropout_rate,
+            positional_dropout_rate,
+            self_attention_dropout_rate,
+            src_attention_dropout_rate,
+            input_layer,
+            use_output_layer,
+            normalize_before,
+            key_bias=key_bias,
+            gradient_checkpointing=gradient_checkpointing,
+            tie_word_embedding=tie_word_embedding)
+    def forward(
+        self,
+        memory: torch.Tensor,
+        memory_mask: torch.Tensor,
+        ys_in_pad: torch.Tensor,
+        ys_in_lens: torch.Tensor,
+        r_ys_in_pad: torch.Tensor,
+        reverse_weight: float = 0.0,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Forward decoder.
+        Args:
+            memory: encoded memory, float32  (batch, maxlen_in, feat)
+            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
+            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
+            ys_in_lens: input lengths of this batch (batch)
+            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
+                used for right to left decoder
+            reverse_weight: used for right to left decoder
+        Returns:
+            (tuple): tuple containing:
+                x: decoded token score before softmax (batch, maxlen_out,
+                    vocab_size) if use_output_layer is True,
+                r_x: x: decoded token score (right to left decoder)
+                    before softmax (batch, maxlen_out, vocab_size)
+                    if use_output_layer is True,
+                olens: (batch, )
+        """
+        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
+                                          ys_in_lens)
+        r_x = torch.tensor(0.0)
+        if reverse_weight > 0.0:
+            r_x, _, olens = self.right_decoder(memory, memory_mask,
+                                               r_ys_in_pad, ys_in_lens)
+        return l_x, r_x, olens
+    def forward_one_step(
+        self,
+        memory: torch.Tensor,
+        memory_mask: torch.Tensor,
+        tgt: torch.Tensor,
+        tgt_mask: torch.Tensor,
+        cache: Optional[List[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """Forward one step.
+            This is only used for decoding.
+        Args:
+            memory: encoded memory, float32  (batch, maxlen_in, feat)
+            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
+            tgt: input token ids, int64 (batch, maxlen_out)
+            tgt_mask: input token mask,  (batch, maxlen_out)
+                      dtype=torch.uint8 in PyTorch 1.2-
+                      dtype=torch.bool in PyTorch 1.2+ (include 1.2)
+            cache: cached output list of (batch, max_time_out-1, size)
+        Returns:
+            y, cache: NN output value and cache per `self.decoders`.
+            y.shape` is (batch, maxlen_out, token)
+        """
+        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
+                                                  tgt_mask, cache)
+    def tie_or_clone_weights(self, jit_mode: bool = True):
+        """Tie or clone module weights (between word_emb and output_layer)
+            depending of whether we are using TorchScript or not"""
+        self.left_decoder.tie_or_clone_weights(jit_mode)
+        self.right_decoder.tie_or_clone_weights(jit_mode)

inspiremusic/transformer/decoder_layer.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# Copyright (c) 2019 Shigeki Karita
+#               2020 Mobvoi Inc (Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Decoder self-attention layer definition."""
+from typing import Optional, Tuple
+import torch
+from torch import nn
+class DecoderLayer(nn.Module):
+    """Single decoder layer module.
+    Args:
+        size (int): Input dimension.
+        self_attn (torch.nn.Module): Self-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        src_attn (torch.nn.Module): Inter-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+            If `None` is passed, Inter-attention is not used, such as
+            CIF, GPT, and other decoder only model.
+        feed_forward (torch.nn.Module): Feed-forward module instance.
+            `PositionwiseFeedForward` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool):
+            True: use layer_norm before each sub-block.
+            False: to use layer_norm after each sub-block.
+    """
+    def __init__(
+        self,
+        size: int,
+        self_attn: nn.Module,
+        src_attn: Optional[nn.Module],
+        feed_forward: nn.Module,
+        dropout_rate: float,
+        normalize_before: bool = True,
+    ):
+        """Construct an DecoderLayer object."""
+        super().__init__()
+        self.size = size
+        self.self_attn = self_attn
+        self.src_attn = src_attn
+        self.feed_forward = feed_forward
+        self.norm1 = nn.LayerNorm(size, eps=1e-5)
+        self.norm2 = nn.LayerNorm(size, eps=1e-5)
+        self.norm3 = nn.LayerNorm(size, eps=1e-5)
+        self.dropout = nn.Dropout(dropout_rate)
+        self.normalize_before = normalize_before
+    def forward(
+        self,
+        tgt: torch.Tensor,
+        tgt_mask: torch.Tensor,
+        memory: torch.Tensor,
+        memory_mask: torch.Tensor,
+        cache: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Compute decoded features.
+        Args:
+            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
+            tgt_mask (torch.Tensor): Mask for input tensor
+                (#batch, maxlen_out).
+            memory (torch.Tensor): Encoded memory
+                (#batch, maxlen_in, size).
+            memory_mask (torch.Tensor): Encoded memory mask
+                (#batch, maxlen_in).
+            cache (torch.Tensor): cached tensors.
+                (#batch, maxlen_out - 1, size).
+        Returns:
+            torch.Tensor: Output tensor (#batch, maxlen_out, size).
+            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
+            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
+            torch.Tensor: Encoded memory mask (#batch, maxlen_in).
+        """
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm1(tgt)
+        if cache is None:
+            tgt_q = tgt
+            tgt_q_mask = tgt_mask
+        else:
+            # compute only the last frame query keeping dim: max_time_out -> 1
+            assert cache.shape == (
+                tgt.shape[0],
+                tgt.shape[1] - 1,
+                self.size,
+            ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
+            tgt_q = tgt[:, -1:, :]
+            residual = residual[:, -1:, :]
+            tgt_q_mask = tgt_mask[:, -1:, :]
+        x = residual + self.dropout(
+            self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
+        if not self.normalize_before:
+            x = self.norm1(x)
+        if self.src_attn is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm2(x)
+            x = residual + self.dropout(
+                self.src_attn(x, memory, memory, memory_mask)[0])
+            if not self.normalize_before:
+                x = self.norm2(x)
+        residual = x
+        if self.normalize_before:
+            x = self.norm3(x)
+        x = residual + self.dropout(self.feed_forward(x))
+        if not self.normalize_before:
+            x = self.norm3(x)
+        if cache is not None:
+            x = torch.cat([cache, x], dim=1)
+        return x, tgt_mask, memory, memory_mask

inspiremusic/transformer/embedding.py ADDED Viewed

	@@ -0,0 +1,294 @@

+# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from ESPnet(https://github.com/espnet/espnet)
+"""Positional Encoding Module."""
+import math
+from typing import Tuple, Union
+import torch
+import torch.nn.functional as F
+import numpy as np
+class PositionalEncoding(torch.nn.Module):
+    """Positional encoding.
+    :param int d_model: embedding dim
+    :param float dropout_rate: dropout rate
+    :param int max_len: maximum input length
+    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
+    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
+    """
+    def __init__(self,
+                 d_model: int,
+                 dropout_rate: float,
+                 max_len: int = 5000,
+                 reverse: bool = False):
+        """Construct an PositionalEncoding object."""
+        super().__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.max_len = max_len
+        self.pe = torch.zeros(self.max_len, self.d_model)
+        position = torch.arange(0, self.max_len,
+                                dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.float32) *
+            -(math.log(10000.0) / self.d_model))
+        self.pe[:, 0::2] = torch.sin(position * div_term)
+        self.pe[:, 1::2] = torch.cos(position * div_term)
+        self.pe = self.pe.unsqueeze(0)
+    def forward(self,
+                x: torch.Tensor,
+                offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
+        """Add positional encoding.
+        Args:
+            x (torch.Tensor): Input. Its shape is (batch, time, ...)
+            offset (int, torch.tensor): position offset
+        Returns:
+            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+            torch.Tensor: for compatibility to RelPositionalEncoding
+        """
+        self.pe = self.pe.to(x.device)
+        pos_emb = self.position_encoding(offset, x.size(1), False)
+        x = x * self.xscale + pos_emb
+        return self.dropout(x), self.dropout(pos_emb)
+    def position_encoding(self,
+                          offset: Union[int, torch.Tensor],
+                          size: int,
+                          apply_dropout: bool = True) -> torch.Tensor:
+        """ For getting encoding in a streaming fashion
+        Attention!!!!!
+        we apply dropout only once at the whole utterance level in a none
+        streaming way, but will call this function several times with
+        increasing input size in a streaming scenario, so the dropout will
+        be applied several times.
+        Args:
+            offset (int or torch.tensor): start offset
+            size (int): required size of position encoding
+        Returns:
+            torch.Tensor: Corresponding encoding
+        """
+        # How to subscript a Union type:
+        #   https://github.com/pytorch/pytorch/issues/69434
+        if isinstance(offset, int):
+            assert offset + size <= self.max_len
+            pos_emb = self.pe[:, offset:offset + size]
+        elif isinstance(offset, torch.Tensor) and offset.dim() == 0:  # scalar
+            assert offset + size <= self.max_len
+            pos_emb = self.pe[:, offset:offset + size]
+        else:  # for batched streaming decoding on GPU
+            assert torch.max(offset) + size <= self.max_len
+            index = offset.unsqueeze(1) + \
+                torch.arange(0, size).to(offset.device)  # B X T
+            flag = index > 0
+            # remove negative offset
+            index = index * flag
+            pos_emb = F.embedding(index, self.pe[0])  # B X T X d_model
+        if apply_dropout:
+            pos_emb = self.dropout(pos_emb)
+        return pos_emb
+class RelPositionalEncoding(PositionalEncoding):
+    """Relative positional encoding module.
+    See : Appendix B in https://arxiv.org/abs/1901.02860
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+    """
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
+        """Initialize class."""
+        super().__init__(d_model, dropout_rate, max_len, reverse=True)
+    def forward(self,
+                x: torch.Tensor,
+                offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compute positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+            torch.Tensor: Positional embedding tensor (1, time, `*`).
+        """
+        self.pe = self.pe.to(x.device)
+        x = x * self.xscale
+        pos_emb = self.position_encoding(offset, x.size(1), False)
+        return self.dropout(x), self.dropout(pos_emb)
+class WhisperPositionalEncoding(PositionalEncoding):
+    """ Sinusoids position encoding used in openai-whisper.encoder
+    """
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
+        super().__init__(d_model, dropout_rate, max_len)
+        self.xscale = 1.0
+        log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
+        inv_timescales = torch.exp(-log_timescale_increment *
+                                   torch.arange(d_model // 2))
+        scaled_time = torch.arange(max_len)[:, np.newaxis] * \
+            inv_timescales[np.newaxis, :]
+        pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+        delattr(self, "pe")
+        self.register_buffer("pe", pe.unsqueeze(0))
+class LearnablePositionalEncoding(PositionalEncoding):
+    """ Learnable position encoding used in openai-whisper.decoder
+    """
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
+        super().__init__(d_model, dropout_rate, max_len)
+        # NOTE(xcsong): overwrite self.pe & self.xscale
+        self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
+        self.xscale = 1.0
+class NoPositionalEncoding(torch.nn.Module):
+    """ No position encoding
+    """
+    def __init__(self, d_model: int, dropout_rate: float):
+        super().__init__()
+        self.d_model = d_model
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+    def forward(self,
+                x: torch.Tensor,
+                offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
+        """ Just return zero vector for interface compatibility
+        """
+        pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
+        return self.dropout(x), pos_emb
+    def position_encoding(self, offset: Union[int, torch.Tensor],
+                          size: int) -> torch.Tensor:
+        return torch.zeros(1, size, self.d_model)
+class EspnetRelPositionalEncoding(torch.nn.Module):
+    """Relative positional encoding module (new implementation).
+    Details can be found in https://github.com/espnet/espnet/pull/2816.
+    See : Appendix B in https://arxiv.org/abs/1901.02860
+    Args:
+        d_model (int): Embedding dimension.
+        dropout_rate (float): Dropout rate.
+        max_len (int): Maximum input length.
+    """
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
+        """Construct an PositionalEncoding object."""
+        super(EspnetRelPositionalEncoding, self).__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+    def extend_pe(self, x: torch.Tensor):
+        """Reset the positional encodings."""
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` means to the position of query vecotr and `j` means the
+        # position of key vector. We use position relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x.size(1), self.d_model)
+        pe_negative = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.float32)
+            * -(math.log(10000.0) / self.d_model)
+        )
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+        # Reserve the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://arxiv.org/abs/1901.02860
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+    def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
+            -> Tuple[torch.Tensor, torch.Tensor]:
+        """Add positional encoding.
+        Args:
+            x (torch.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            torch.Tensor: Encoded tensor (batch, time, `*`).
+        """
+        self.extend_pe(x)
+        x = x * self.xscale
+        pos_emb = self.position_encoding(size=x.size(1), offset=offset)
+        return self.dropout(x), self.dropout(pos_emb)
+    def position_encoding(self,
+                          offset: Union[int, torch.Tensor],
+                          size: int) -> torch.Tensor:
+        """ For getting encoding in a streaming fashion
+        Attention!!!!!
+        we apply dropout only once at the whole utterance level in a none
+        streaming way, but will call this function several times with
+        increasing input size in a streaming scenario, so the dropout will
+        be applied several times.
+        Args:
+            offset (int or torch.tensor): start offset
+            size (int): required size of position encoding
+        Returns:
+            torch.Tensor: Corresponding encoding
+        """
+        pos_emb = self.pe[
+            :,
+            self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
+        ]
+        return pos_emb