chong.zhang committed on
Commit
96fe5d9
·
1 Parent(s): 43dbb02
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. app.py +205 -263
  2. inspiremusic/__init__.py +0 -0
  3. inspiremusic/bin/export_jit.py +74 -0
  4. inspiremusic/bin/export_onnx.py +112 -0
  5. inspiremusic/bin/flow_only_infer.py +150 -0
  6. inspiremusic/bin/inference.py +266 -0
  7. inspiremusic/bin/train.py +194 -0
  8. inspiremusic/cli/__init__.py +0 -0
  9. inspiremusic/cli/frontend.py +106 -0
  10. inspiremusic/cli/inference.py +296 -0
  11. inspiremusic/cli/inspiremusic.py +133 -0
  12. inspiremusic/cli/model.py +297 -0
  13. inspiremusic/dataset/__init__.py +0 -0
  14. inspiremusic/dataset/dataset.py +154 -0
  15. inspiremusic/dataset/processor.py +595 -0
  16. inspiremusic/flow/decoder.py +277 -0
  17. inspiremusic/flow/flow.py +143 -0
  18. inspiremusic/flow/flow_matching.py +167 -0
  19. inspiremusic/flow/length_regulator.py +69 -0
  20. inspiremusic/hifigan/discriminator.py +140 -0
  21. inspiremusic/hifigan/f0_predictor.py +55 -0
  22. inspiremusic/hifigan/generator.py +411 -0
  23. inspiremusic/hifigan/hifigan.py +66 -0
  24. inspiremusic/llm/llm.py +402 -0
  25. inspiremusic/metrics/clap_score.py +135 -0
  26. inspiremusic/metrics/openl3_fd.py +338 -0
  27. inspiremusic/metrics/passt_kld.py +232 -0
  28. inspiremusic/music_tokenizer/__init__.py +0 -0
  29. inspiremusic/music_tokenizer/env.py +29 -0
  30. inspiremusic/music_tokenizer/meldataset.py +226 -0
  31. inspiremusic/music_tokenizer/models.py +548 -0
  32. inspiremusic/music_tokenizer/vqvae.py +58 -0
  33. inspiremusic/text/abs_tokenizer.py +34 -0
  34. inspiremusic/text/tokenizer.py +76 -0
  35. inspiremusic/transformer/__init__.py +0 -0
  36. inspiremusic/transformer/activation.py +84 -0
  37. inspiremusic/transformer/attention.py +328 -0
  38. inspiremusic/transformer/convolution.py +145 -0
  39. inspiremusic/transformer/decoder.py +396 -0
  40. inspiremusic/transformer/decoder_layer.py +132 -0
  41. inspiremusic/transformer/embedding.py +294 -0
  42. inspiremusic/transformer/encoder.py +477 -0
  43. inspiremusic/transformer/encoder_layer.py +235 -0
  44. inspiremusic/transformer/label_smoothing_loss.py +97 -0
  45. inspiremusic/transformer/positionwise_feed_forward.py +115 -0
  46. inspiremusic/transformer/qwen_encoder.py +165 -0
  47. inspiremusic/transformer/subsampling.py +384 -0
  48. inspiremusic/utils/__init__.py +0 -0
  49. inspiremusic/utils/audio_utils.py +623 -0
  50. inspiremusic/utils/binary.py +155 -0
app.py CHANGED
@@ -1,193 +1,129 @@
1
  # coding=utf-8
2
 
3
- import os
4
- import librosa
5
- import base64
6
  import io
7
- import gradio as gr
8
- import re
9
-
10
  import numpy as np
11
- import torch
12
  import torchaudio
13
- from modelscope import HubApi
14
-
15
- api = HubApi()
16
-
17
- key = os.environ["apikey"] if "apikey" in os.environ else ""
18
- try:
19
- api.login(key)
20
- except:
21
- pass
22
-
23
- from funasr import AutoModel
24
-
25
- model = "iic/SenseVoiceSmall"
26
- model = AutoModel(model=model,
27
- vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
28
- vad_kwargs={"max_single_segment_time": 30000},
29
- trust_remote_code=True,
30
- )
31
-
32
- import re
33
-
34
- emo_dict = {
35
- "<|HAPPY|>": "😊",
36
- "<|SAD|>": "😔",
37
- "<|ANGRY|>": "😡",
38
- "<|NEUTRAL|>": "",
39
- "<|FEARFUL|>": "😰",
40
- "<|DISGUSTED|>": "🤢",
41
- "<|SURPRISED|>": "😮",
42
- }
43
-
44
- event_dict = {
45
- "<|BGM|>": "🎼",
46
- "<|Speech|>": "",
47
- "<|Applause|>": "👏",
48
- "<|Laughter|>": "😀",
49
- "<|Cry|>": "😭",
50
- "<|Sneeze|>": "🤧",
51
- "<|Breath|>": "",
52
- "<|Cough|>": "🤧",
53
- }
54
-
55
- emoji_dict = {
56
- "<|nospeech|><|Event_UNK|>": "❓",
57
- "<|zh|>": "",
58
- "<|en|>": "",
59
- "<|yue|>": "",
60
- "<|ja|>": "",
61
- "<|ko|>": "",
62
- "<|nospeech|>": "",
63
- "<|HAPPY|>": "😊",
64
- "<|SAD|>": "😔",
65
- "<|ANGRY|>": "😡",
66
- "<|NEUTRAL|>": "",
67
- "<|BGM|>": "🎼",
68
- "<|Speech|>": "",
69
- "<|Applause|>": "👏",
70
- "<|Laughter|>": "😀",
71
- "<|FEARFUL|>": "😰",
72
- "<|DISGUSTED|>": "🤢",
73
- "<|SURPRISED|>": "😮",
74
- "<|Cry|>": "😭",
75
- "<|EMO_UNKNOWN|>": "",
76
- "<|Sneeze|>": "🤧",
77
- "<|Breath|>": "",
78
- "<|Cough|>": "😷",
79
- "<|Sing|>": "",
80
- "<|Speech_Noise|>": "",
81
- "<|withitn|>": "",
82
- "<|woitn|>": "",
83
- "<|GBG|>": "",
84
- "<|Event_UNK|>": "",
85
- }
86
-
87
- lang_dict = {
88
- "<|zh|>": "<|lang|>",
89
- "<|en|>": "<|lang|>",
90
- "<|yue|>": "<|lang|>",
91
- "<|ja|>": "<|lang|>",
92
- "<|ko|>": "<|lang|>",
93
- "<|nospeech|>": "<|lang|>",
94
- }
95
-
96
-
97
- emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
98
- event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷",}
99
-
100
- notes = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
101
-
102
- def format_str(s):
103
- for sptk in emoji_dict:
104
- s = s.replace(sptk, emoji_dict[sptk])
105
- return s
106
-
107
-
108
- def format_str_v2(s):
109
- sptk_dict = {}
110
- for sptk in emoji_dict:
111
- sptk_dict[sptk] = s.count(sptk)
112
- s = s.replace(sptk, "")
113
- emo = "<|NEUTRAL|>"
114
- for e in emo_dict:
115
- if sptk_dict[e] > sptk_dict[emo]:
116
- emo = e
117
- for e in event_dict:
118
- if sptk_dict[e] > 0:
119
- s = event_dict[e] + s
120
- s = s + emo_dict[emo]
121
-
122
- for emoji in emo_set.union(event_set):
123
- s = s.replace(" " + emoji, emoji)
124
- s = s.replace(emoji + " ", emoji)
125
- return s.strip()
126
-
127
- def format_str_v3(s):
128
- def get_emo(s):
129
- return s[-1] if s[-1] in emo_set else None
130
- def get_event(s):
131
- return s[0] if s[0] in event_set else None
132
-
133
- s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
134
- for lang in lang_dict:
135
- s = s.replace(lang, "<|lang|>")
136
- s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
137
- new_s = " " + s_list[0]
138
- cur_ent_event = get_event(new_s)
139
- for i in range(1, len(s_list)):
140
- if len(s_list[i]) == 0:
141
- continue
142
- if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
143
- s_list[i] = s_list[i][1:]
144
- #else:
145
- cur_ent_event = get_event(s_list[i])
146
- if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
147
- new_s = new_s[:-1]
148
- new_s += s_list[i].strip().lstrip()
149
- new_s = new_s.replace("The.", " ")
150
- return new_s.strip()
151
-
152
- def model_inference(input_wav, language, fs=16000):
153
- # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
154
- language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja", "ko": "ko",
155
- "nospeech": "nospeech"}
156
-
157
- # task = "Speech Recognition" if task is None else task
158
- language = "auto" if len(language) < 1 else language
159
- selected_language = language_abbr[language]
160
- # selected_task = task_abbr.get(task)
161
-
162
- # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
163
-
164
- if isinstance(input_wav, tuple):
165
- fs, input_wav = input_wav
166
- input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
167
- if len(input_wav.shape) > 1:
168
- input_wav = input_wav.mean(-1)
169
- if fs != 16000:
170
- print(f"audio_fs: {fs}")
171
- resampler = torchaudio.transforms.Resample(fs, 16000)
172
- input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
173
- input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
174
-
175
-
176
- merge_vad = True
177
- print(f"language: {language}, merge_vad: {merge_vad}")
178
- text = model.generate(input=input_wav,
179
- cache={},
180
- language=language,
181
- use_itn=True,
182
- batch_size_s=300, merge_vad=merge_vad)
183
-
184
- print(text)
185
- text = text[0]["text"]
186
- text = format_str_v3(text)
187
- print(text)
188
-
189
- return text
190
 
191
 
192
  audio_examples = [
193
  ["example/inspiremusic/inspiremusic_01.wav", "text-to-music"],
@@ -218,7 +154,7 @@ description = """
218
  - `The instrumental rock piece features a prominent bass guitar, delivering a pure and energetic sound.`
219
  - `A serene blend of instrumental and light pop, featuring soothing melodies and a gentle, soulful keyboard performance.`
220
 
221
- Recommended select audio duration is below 30 seconds. For audio longer than 30 seconds, local deployment is recommended, github repo.
222
 
223
  """
224
 
@@ -232,86 +168,92 @@ html_content = """
232
  <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/InspireMusic" target="_blank">Code</a> </p>
233
  <p style="font-size: 18px;margin-left: 20px;"><a href="https://iris2c.github.io/InspireMusic" target="_blank">Demo</a></p>
234
  <h2 style="font-size: 22px;margin-left: 0px;">Models</h2>
235
- <p style="font-size: 18px;margin-left: 20px;"><a href="https://modelscope.cn/models/iic/InspireMusic/summary" target="_blank">Modelscope Model</a>: </p>
236
- <p style="font-size: 18px;margin-left: 20px;"><a href="https://huggingface.co/FunAudioLLM/InspireMusic-Base" target="_blank">Huggingface Model</a></p>
237
  </div>
238
  """
239
 
240
- # 自定义表格的 HTML 和 CSS 代码
241
- centered_table_html = """
242
- <style>
243
- .centered-table {
244
- margin-left: auto;
245
- margin-right: auto;
246
- }
247
- </style>
248
- <div class="centered-table">
249
- <table border="1" style="border-collapse: collapse; width: 100%;">
250
- <tr>
251
- <th>Samples</th>
252
- <th>InspireMusic</th>
253
- <th>Text-to-Music</th>
254
- </tr>
255
-
256
- <tr>
257
- <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/InspireMusic/demo/inspiremusic/inspiremusic_01.wav?OSSAccessKeyId=LTAI4Fmg1PUZcHLPSMGznooK&Expires=1734163633&Signature=hGhy9ACAm0ETPAGEyPhs%2BWkosrY%3D" target="_blank">normal mode</a></td>
258
- <td>Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.</td>
259
- </tr>
260
-
261
- <tr>
262
- <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/InspireMusic/demo/inspiremusic/inspiremusic_noflow_01.wav?OSSAccessKeyId=LTAI4Fmg1PUZcHLPSMGznooK&Expires=1737403768&Signature=1AdAJxwLfGBajej0AIYk3oN0%2Bw8%3D" target="_blank">fast mode</a></td>
263
- <td>The instrumental piece exudes a playful and whimsical atmosphere, likely featuring lively and rhythmic elements. The music seems to be inspired by nature and animals, creating an engaging and light-hearted experience.</td>
264
- </tr>
265
- </table>
266
- </div>
267
- """
268
-
269
-
270
- def launch():
271
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
272
- # gr.Markdown(description)
273
- gr.HTML(html_content)
274
- with gr.Column():
275
- with gr.Row():
276
- with gr.Column():
277
- text_inputs = gr.Textbox(
278
- label="Input Text",
279
- placeholder="Enter the text you want to generate music, e.g., Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.",
280
- lines=3
281
- )
282
- fn_button = gr.Button("Start", variant="primary")
283
- audio_inputs = gr.Audio(
284
- label="Upload prompt audio",
285
- )
286
- with gr.Column():
287
- with gr.Accordion("Configuration"):
288
- # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"],
289
- # value="Speech Recognition", label="Task")
290
- task_inputs = gr.Dropdown(choices=["text-to-music", "music-continuation"],
291
- value="text-to-music",
292
- label="Task")
293
- inference_mode_inputs = gr.Dropdown(choices=["normal", "fast"],
294
- value="normal",
295
- label="Inference Mode")
296
- cfg_input = gr.Slider(3, 10, step=1, label="CFG value")
297
- audio_length = gr.Textbox(value="30",
298
- label="Duration in seconds")
299
-
300
- gr.Examples(examples=audio_examples,
301
- inputs=[text_inputs, audio_inputs, task_inputs],
302
- examples_per_page=5)
303
-
304
- audio_output = gr.Audio(label="Audio Output")
305
-
306
- fn_button.click(model_inference, inputs=[text_inputs, audio_inputs, task_inputs], outputs=audio_output)
307
-
308
- # with gr.Accordion("More examples"):
309
- # gr.HTML(centered_table_html)
310
- demo.launch()
311
-
312
-
313
- if __name__ == "__main__":
314
- # iface.launch()
315
- launch()
 
316
 
317
 
 
1
  # coding=utf-8
2
 
3
  import io
4
  import numpy as np
 
5
  import torchaudio
6
 
7
+ import torch
8
+ import soundfile as sf
9
+ import gradio as gr
10
+ import spaces
11
+ from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
12
+ import os
13
+ import sys
+ import argparse
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser(
18
+ description='Run inference with your model')
19
+ parser.add_argument('-m', '--model_name', default="InspireMusic-1.5B-Long",
20
+ help='Model name')
21
+
22
+ parser.add_argument('-d', '--model_dir',
23
+ help='Model folder path')
24
+
25
+ parser.add_argument('-t', '--text',
26
+ default="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.",
27
+ help='Prompt text')
28
+
29
+ parser.add_argument('-a', '--audio_prompt', default=None,
30
+ help='Prompt audio')
31
+
32
+ parser.add_argument('-c', '--chorus', default="intro",
33
+ help='Chorus tag generation mode (e.g., random, verse, chorus, intro, outro)')
34
+
35
+ parser.add_argument('--fast', type=bool, default=False,
36
+ help='Enable fast inference mode (without flow matching)')
37
+
38
+ parser.add_argument('-g', '--gpu', type=int, default=0,
39
+ help='GPU ID for this rank, -1 for CPU')
40
+
41
+ parser.add_argument('--task', default='text-to-music',
42
+ choices=['text-to-music', 'continuation', 'reconstruct', 'super_resolution'],
43
+ help='Inference task type: text-to-music, continuation, reconstruct, super_resolution')
44
+
45
+ parser.add_argument('-r', '--result_dir', default="exp/inspiremusic",
46
+ help='Directory to save generated audio')
47
+
48
+ parser.add_argument('-o', '--output_fn', default="output_audio",
49
+ help='Output file name')
50
+
51
+ parser.add_argument('-f', '--format', type=str, default="wav",
52
+ choices=["wav", "mp3", "m4a", "flac"],
53
+ help='Format of output audio')
54
+
55
+ parser.add_argument('--sample_rate', type=int, default=24000,
56
+ help='Sampling rate of input audio')
57
+
58
+ parser.add_argument('--output_sample_rate', type=int, default=48000,
59
+ choices=[24000, 48000],
60
+ help='Sampling rate of generated output audio')
61
+
62
+ parser.add_argument('-s', '--time_start', type=float, default=0.0,
63
+ help='Start time in seconds')
64
+
65
+ parser.add_argument('-e', '--time_end', type=float, default=30.0,
66
+ help='End time in seconds')
67
+
68
+ parser.add_argument('--max_audio_prompt_length', type=float, default=5.0,
69
+ help='Maximum audio prompt length in seconds')
70
+
71
+ parser.add_argument('--min_generate_audio_seconds', type=float,
72
+ default=10.0,
73
+ help='Minimum generated audio length in seconds')
74
+
75
+ parser.add_argument('--max_generate_audio_seconds', type=float,
76
+ default=30.0,
77
+ help='Maximum generated audio length in seconds')
78
+
79
+ parser.add_argument('--fp16', type=bool, default=True,
80
+ help='Inference with fp16 model')
81
+
82
+ parser.add_argument('--fade_out', type=bool, default=True,
83
+ help='Apply fade out effect to generated audio')
84
+
85
+ parser.add_argument('--fade_out_duration', type=float, default=1.0,
86
+ help='Fade out duration in seconds')
87
+
88
+ parser.add_argument('--trim', type=bool, default=False,
89
+ help='Trim the silence ending of generated audio')
90
+
91
+ args = parser.parse_args()
92
+
93
+ if not args.model_dir:
94
+ args.model_dir = os.path.join("./pretrained_models", args.model_name)
95
+
96
+ print(args)
97
+ return args
98
+
99
+ def InspireMusic(args):
100
+ set_env_variables()
101
+ model = InspireMusicUnified(model_name=args.model_name,
102
+ model_dir=args.model_dir,
103
+ min_generate_audio_seconds=args.min_generate_audio_seconds,
104
+ max_generate_audio_seconds=args.max_generate_audio_seconds,
105
+ sample_rate=args.sample_rate,
106
+ output_sample_rate=args.output_sample_rate,
107
+ load_jit=True,
108
+ load_onnx=False,
109
+ fast=args.fast,
110
+ fp16=args.fp16,
111
+ gpu=args.gpu,
112
+ result_dir=args.result_dir)
113
+
114
+ model.inference(task=args.task,
115
+ text=args.text,
116
+ audio_prompt=args.audio_prompt,
117
+ chorus=args.chorus,
118
+ time_start=args.time_start,
119
+ time_end=args.time_end,
120
+ output_fn=args.output_fn,
121
+ max_audio_prompt_length=args.max_audio_prompt_length,
122
+ fade_out_duration=args.fade_out_duration,
123
+ output_format=args.format,
124
+ fade_out_mode=args.fade_out,
125
+ trim=args.trim)
126
+ return os.path.join(args.result_dir, f"{args.output_fn}.{args.format}")
127
 
128
  audio_examples = [
129
  ["example/inspiremusic/inspiremusic_01.wav", "text-to-music"],
 
154
  - `The instrumental rock piece features a prominent bass guitar, delivering a pure and energetic sound.`
155
  - `A serene blend of instrumental and light pop, featuring soothing melodies and a gentle, soulful keyboard performance.`
156
 
157
+ The recommended audio prompt duration is 5 seconds, and the recommended generated audio length is below 30 seconds. To generate audio longer than 30 seconds, local deployment via the GitHub repo is recommended.
158
 
159
  """
160
 
 
168
  <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/InspireMusic" target="_blank">Code</a> </p>
169
  <p style="font-size: 18px;margin-left: 20px;"><a href="https://iris2c.github.io/InspireMusic" target="_blank">Demo</a></p>
170
  <h2 style="font-size: 22px;margin-left: 0px;">Models</h2>
171
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://modelscope.cn/models/iic/InspireMusic-1.5B-Long/summary" target="_blank">Modelscope Model</a>: </p>
172
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long" target="_blank">Huggingface Model</a></p>
173
  </div>
174
  """
175
 
176
+ def music_generation(task, text=None, audio=None):
177
+ args = get_args()
178
+ args.task = task
179
+ args.text = text if text else args.text
180
+ args.audio_prompt = audio if audio else args.audio_prompt
181
+ generate_audio_path = InspireMusic(args)
182
+ return generate_audio_path
183
+
184
+ demo = gr.Blocks()
185
+
186
+ t2m_demo = gr.Interface(
187
+ fn=music_generation,
188
+ inputs = [
189
+ gr.Dropdown(["Text-To-Music"], value="text-to-music", multiselect=False, info="Choose a task."),
190
+ gr.Text(label="Input Text"),
191
+ ],
192
+ outputs = [
193
+ gr.Audio(label="Generated Music", type="generated audio filepath"),
194
+ ],
195
+ title = "<a href='https://github.com/FunAudioLLM/InspireMusic' target='_blank'>InspireMusic<a/>: A Unified Framework for Music, Song, Audio Generation.",
196
+ description = ("InspireMusic ([Github Repo](https://github.com/FunAudioLLM/InspireMusic)) is a fundamental AIGC toolkit and models designed for music, song, and audio generation using PyTorch."
197
+ "To try it, simply type text to generation music, or click one of the examples. "),
198
+ article = ("<p style='text-align: center'><a href='' target='_blank'>InspireMusic</a> </p>"
199
+ "<p style='text-align: center'><a href='https://openreview.net/forum?id=yBlVlS2Fd9' target='_blank'>WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling</a> </p>"),
200
+ examples = [
201
+ ["example/inspiremusic/inspiremusic_01.wav", "24000 Hz"],
202
+ ["example/ras/chorus/chorus_01.wav", "48000 Hz"],
203
+ ],
204
+ cache_examples = True,
205
+ )
206
+
207
+ con_demo = gr.Interface(
208
+ fn=music_generation,
209
+ inputs = [
210
+ gr.Dropdown(["Music Continuation"], value="continuation", multiselect=False, info="Choose a task."),
211
+ gr.Text(label="Input Text"),
212
+ gr.Audio(label="Input Audio Prompt", type="audio prompt filepath"),
213
+ ],
214
+ outputs = [
215
+ gr.Audio(label="Generated Music", type="generated audio filepath"),
216
+ ],
217
+ title = "<a href='https://github.com/FunAudioLLM/InspireMusic' target='_blank'>InspireMusic<a/>: A Unified Framework for Music, Song, Audio Generation.",
218
+ description = ("InspireMusic ([Github Repo](https://github.com/FunAudioLLM/InspireMusic)) is a fundamental AIGC toolkit and models designed for music, song, and audio generation using PyTorch."
219
+ "To try it, simply type text to generation music, or click one of the examples. "),
220
+ article = ("<p style='text-align: center'><a href='' target='_blank'>InspireMusic</a> </p>"
221
+ "<p style='text-align: center'><a href='https://openreview.net/forum?id=yBlVlS2Fd9' target='_blank'>WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling</a> </p>"),
222
+ examples = [
223
+ ["example/inspiremusic/inspiremusic_01.wav", "24000 Hz"],
224
+ ["example/ras/chorus/chorus_01.wav", "48000 Hz"],
225
+ ],
226
+ cache_examples = True,
227
+ )
228
+
229
+ con_demo = gr.Interface(
230
+ fn=music_generation,
231
+ inputs = [
232
+ gr.Dropdown(["Music Continuation"], value="continuation", multiselect=False, info="Choose a task."),
233
+ gr.Text(label="Input Text"),
234
+ gr.Audio(label="Input Audio Prompt", type="audio prompt filepath"),
235
+ ],
236
+ outputs = [
237
+ gr.Audio(label="Generated Music", type="generated audio filepath"),
238
+ ],
239
+ title = "<a href='https://github.com/FunAudioLLM/InspireMusic' target='_blank'>InspireMusic<a/>: A Unified Framework for Music, Song, Audio Generation.",
240
+ description = ("InspireMusic ([Github Repo](https://github.com/FunAudioLLM/InspireMusic)) is a fundamental AIGC toolkit and models designed for music, song, and audio generation using PyTorch."
241
+ "To try it, simply type text to generation music, or click one of the examples. "),
242
+ article = ("<p style='text-align: center'><a href='' target='_blank'>InspireMusic</a> </p>"
243
+ "<p style='text-align: center'><a href='https://openreview.net/forum?id=yBlVlS2Fd9' target='_blank'>WavTokenizer: an Efficient Acoustic Discrete Codec Tokenizer for Audio Language Modeling</a> </p>"),
244
+ examples = [
245
+ ["example/inspiremusic/inspiremusic_01.wav", "24000 Hz"],
246
+ ["example/ras/chorus/chorus_01.wav", "48000 Hz"],
247
+ ],
248
+ cache_examples = True,
249
+ )
250
+
251
+ with demo:
252
+ gr.TabbedInterface([t2m_demo, con_demo,],
253
+ ["Task 1: Text-to-Music",
254
+ "Task 2: Music Continuation"])
255
+ # gr.TabbedInterface([t2m_demo, con_demo, fast_demo], ["Task 1: Text-to-Music", "Task 2: Music Continuation", "Task 3: Without Flow Matching"])
256
+
257
+ demo.launch()
258
 
259
 
inspiremusic/__init__.py ADDED
File without changes
inspiremusic/bin/export_jit.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+ import sys
22
+ import torch
23
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
24
+ sys.path.append('{}/../..'.format(ROOT_DIR))
25
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
26
+ from inspiremusic.cli.inspiremusic import InspireMusic
27
+
28
+
29
+ def get_args():
30
+ parser = argparse.ArgumentParser(description='export your model for deployment')
31
+ parser.add_argument('--model_dir',
32
+ type=str,
33
+ default='pretrained_models/InspireMusic',
34
+ help='local path')
35
+ args = parser.parse_args()
36
+ print(args)
37
+ return args
38
+
39
+
40
+ def main():
41
+ args = get_args()
42
+ logging.basicConfig(level=logging.DEBUG,
43
+ format='%(asctime)s %(levelname)s %(message)s')
44
+
45
+ torch._C._jit_set_fusion_strategy([('STATIC', 1)])
46
+ torch._C._jit_set_profiling_mode(False)
47
+ torch._C._jit_set_profiling_executor(False)
48
+
49
+ inspiremusic = InspireMusic(args.model_dir, load_jit=False, load_onnx=False)
50
+
51
+ # 1. export llm text_encoder
52
+ llm_text_encoder = inspiremusic.model.llm.text_encoder.half()
53
+ script = torch.jit.script(llm_text_encoder)
54
+ script = torch.jit.freeze(script)
55
+ script = torch.jit.optimize_for_inference(script)
56
+ script.save('{}/llm.text_encoder.fp16.zip'.format(args.model_dir))
57
+
58
+ # 2. export llm llm
59
+ llm_llm = inspiremusic.model.llm.llm.half()
60
+ script = torch.jit.script(llm_llm)
61
+ script = torch.jit.freeze(script, preserved_attrs=['forward_chunk'])
62
+ script = torch.jit.optimize_for_inference(script)
63
+ script.save('{}/llm.llm.fp16.zip'.format(args.model_dir))
64
+
65
+ # 3. export flow encoder
66
+ flow_encoder = inspiremusic.model.flow.encoder
67
+ script = torch.jit.script(flow_encoder)
68
+ script = torch.jit.freeze(script)
69
+ script = torch.jit.optimize_for_inference(script)
70
+ script.save('{}/flow.encoder.fp32.zip'.format(args.model_dir))
71
+
72
+
73
+ if __name__ == '__main__':
74
+ main()
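As a quick sanity check (not part of the commit), the TorchScript archives written by main() above can be reloaded with torch.jit.load; the model_dir below mirrors the script's --model_dir default.

```python
import torch

# Hypothetical smoke test for the archives saved by export_jit.py above.
model_dir = "pretrained_models/InspireMusic"   # same default as --model_dir
text_encoder = torch.jit.load(f"{model_dir}/llm.text_encoder.fp16.zip")
llm = torch.jit.load(f"{model_dir}/llm.llm.fp16.zip")
flow_encoder = torch.jit.load(f"{model_dir}/flow.encoder.fp32.zip")
print(type(text_encoder), type(llm), type(flow_encoder))
```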
inspiremusic/bin/export_onnx.py ADDED
@@ -0,0 +1,112 @@
1
+ # Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, [email protected])
2
+ # Copyright (c) 2024 Alibaba Inc
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import print_function
17
+
18
+ import argparse
19
+ import logging
20
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
21
+ import os
22
+ import sys
23
+ import onnxruntime
24
+ import random
25
+ import torch
26
+ from tqdm import tqdm
27
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
28
+ sys.path.append('{}/../..'.format(ROOT_DIR))
29
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
30
+ from inspiremusic.cli.inspiremusic import InspireMusic
31
+
32
+
33
+ def get_dummy_input(batch_size, seq_len, out_channels, device):
34
+ x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
35
+ mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
36
+ mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
37
+ t = torch.rand((batch_size), dtype=torch.float32, device=device)
38
+ spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
39
+ cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
40
+ return x, mask, mu, t, spks, cond
41
+
42
+
43
+ def get_args():
44
+ parser = argparse.ArgumentParser(description='export your model for deployment')
45
+ parser.add_argument('--model_dir',
46
+ type=str,
47
+ default='pretrained_models/InspireMusic',
48
+ help='local path')
49
+ args = parser.parse_args()
50
+ print(args)
51
+ return args
52
+
53
+
54
+ def main():
55
+ args = get_args()
56
+ logging.basicConfig(level=logging.DEBUG,
57
+ format='%(asctime)s %(levelname)s %(message)s')
58
+
59
+ inspiremusic = InspireMusic(args.model_dir, load_jit=False, load_onnx=False)
60
+
61
+ # 1. export flow decoder estimator
62
+ estimator = inspiremusic.model.flow.decoder.estimator
63
+
64
+ device = inspiremusic.model.device
65
+ batch_size, seq_len = 1, 256
66
+ out_channels = inspiremusic.model.flow.decoder.estimator.out_channels
67
+ x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
68
+ torch.onnx.export(
69
+ estimator,
70
+ (x, mask, mu, t, spks, cond),
71
+ '{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
72
+ export_params=True,
73
+ opset_version=18,
74
+ do_constant_folding=True,
75
+ input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
76
+ output_names=['estimator_out'],
77
+ dynamic_axes={
78
+ 'x': {0: 'batch_size', 2: 'seq_len'},
79
+ 'mask': {0: 'batch_size', 2: 'seq_len'},
80
+ 'mu': {0: 'batch_size', 2: 'seq_len'},
81
+ 'cond': {0: 'batch_size', 2: 'seq_len'},
82
+ 't': {0: 'batch_size'},
83
+ 'spks': {0: 'batch_size'},
84
+ 'estimator_out': {0: 'batch_size', 2: 'seq_len'},
85
+ }
86
+ )
87
+
88
+ # 2. test computation consistency
89
+ option = onnxruntime.SessionOptions()
90
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
91
+ option.intra_op_num_threads = 1
92
+ providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
93
+ estimator_onnx = onnxruntime.InferenceSession('{}/flow.decoder.estimator.fp32.onnx'.format(args.model_dir),
94
+ sess_options=option, providers=providers)
95
+
96
+ for _ in tqdm(range(10)):
97
+ x, mask, mu, t, spks, cond = get_dummy_input(random.randint(1, 6), random.randint(16, 512), out_channels, device)
98
+ output_pytorch = estimator(x, mask, mu, t, spks, cond)
99
+ ort_inputs = {
100
+ 'x': x.cpu().numpy(),
101
+ 'mask': mask.cpu().numpy(),
102
+ 'mu': mu.cpu().numpy(),
103
+ 't': t.cpu().numpy(),
104
+ 'spks': spks.cpu().numpy(),
105
+ 'cond': cond.cpu().numpy()
106
+ }
107
+ output_onnx = estimator_onnx.run(None, ort_inputs)[0]
108
+ torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
109
+
110
+
111
+ if __name__ == "__main__":
112
+ main()
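For completeness, a hedged sketch (not part of the commit) of running the exported estimator standalone with onnxruntime; tensor shapes follow get_dummy_input() above, and the channel count of 80 is an assumption that must match estimator.out_channels.

```python
import numpy as np
import onnxruntime

# Hypothetical standalone inference with the ONNX file exported above.
sess = onnxruntime.InferenceSession(
    "pretrained_models/InspireMusic/flow.decoder.estimator.fp32.onnx",
    providers=["CPUExecutionProvider"])
B, C, T = 1, 80, 256        # C is assumed; it must equal estimator.out_channels
feed = {
    "x":    np.random.rand(B, C, T).astype(np.float32),
    "mask": np.ones((B, 1, T), dtype=np.float32),
    "mu":   np.random.rand(B, C, T).astype(np.float32),
    "t":    np.random.rand(B).astype(np.float32),
    "spks": np.random.rand(B, C).astype(np.float32),
    "cond": np.random.rand(B, C, T).astype(np.float32),
}
estimator_out = sess.run(None, feed)[0]
print(estimator_out.shape)   # (B, C, T)
```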
inspiremusic/bin/flow_only_infer.py ADDED
@@ -0,0 +1,150 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ import os
21
+ import torch
22
+ from torch.utils.data import DataLoader
23
+ import torchaudio
24
+ from hyperpyyaml import load_hyperpyyaml
25
+ from tqdm import tqdm
26
+ from inspiremusic.cli.model import InspireMusicModel
27
+ from inspiremusic.dataset.dataset import Dataset
28
+ from inspiremusic.utils.common import MUSIC_STRUCTURE_LABELS
29
+
30
+ def get_args():
31
+ parser = argparse.ArgumentParser(description='inference only with flow model')
32
+ parser.add_argument('--config', required=True, help='config file')
33
+ parser.add_argument('--prompt_data', required=True, help='prompt data file')
34
+ parser.add_argument('--flow_model', required=True, help='flow model file')
35
+ parser.add_argument('--llm_model', default=None,required=False, help='llm model file')
36
+
37
+ parser.add_argument('--music_tokenizer', required=True, help='music tokenizer model file')
38
+ parser.add_argument('--wavtokenizer', required=True, help='wavtokenizer model file')
39
+ parser.add_argument('--chorus', default="random",required=False, help='chorus tag generation mode, eg. random, verse, chorus, intro.')
40
+ parser.add_argument('--sample_rate', type=int, default=48000, required=False,
41
+ help='sampling rate of generated audio')
42
+ parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0, required=False,
43
+ help='the minimum generated audio length in seconds')
44
+ parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0, required=False,
45
+ help='the maximum generated audio length in seconds')
46
+ parser.add_argument('--gpu',
47
+ type=int,
48
+ default=-1,
49
+ help='gpu id for this rank, -1 for cpu')
50
+ parser.add_argument('--result_dir', required=True, help='asr result file')
51
+ args = parser.parse_args()
52
+ print(args)
53
+ return args
54
+
55
+ def main():
56
+ args = get_args()
57
+ logging.basicConfig(level=logging.DEBUG,
58
+ format='%(asctime)s %(levelname)s %(message)s')
59
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
60
+
61
+ # Init inspiremusic models from configs
62
+ use_cuda = args.gpu >= 0 and torch.cuda.is_available()
63
+ device = torch.device('cuda' if use_cuda else 'cpu')
64
+ with open(args.config, 'r') as f:
65
+ configs = load_hyperpyyaml(f)
66
+
67
+ model = InspireMusicModel(None, configs['flow'], configs['hift'], configs['wavtokenizer'])
68
+ model.load(args.llm_model, args.flow_model, args.music_tokenizer, args.wavtokenizer)
69
+
70
+ if args.llm_model is None:
71
+ model.llm = None
72
+ else:
73
+ model.llm = model.llm.to(torch.float32)
74
+
75
+ if args.flow_model is None:
76
+ model.flow = None
77
+
78
+ test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=True, partition=False)
79
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
80
+
81
+ del configs
82
+ os.makedirs(args.result_dir, exist_ok=True)
83
+ fn = os.path.join(args.result_dir, 'wav.scp')
84
+ f = open(fn, 'w')
85
+ with torch.no_grad():
86
+ for _, batch in tqdm(enumerate(test_data_loader)):
87
+ utts = batch["utts"]
88
+ assert len(utts) == 1, "inference mode only support batchsize 1"
89
+
90
+ if "semantic_token" in batch:
91
+ token = batch["semantic_token"].to(device)
92
+ token_len = batch["semantic_token_len"].to(device)
93
+ else:
94
+ if audio_token is None:
95
+ token = None
96
+ token_len = None
97
+ else:
98
+ token = audio_token.view(audio_token.size(0),-1,4)[:,:,0]
99
+ token_len = audio_token_len / 4
100
+
101
+ text_token = batch["text_token"].to(device)
102
+ text_token_len = batch["text_token_len"].to(device)
103
+ text = batch["text"]
104
+
105
+ if "time_start" not in batch.keys():
106
+ batch["time_start"] = torch.randint(0, args.min_generate_audio_seconds, (1,)).to(torch.float64)
107
+ if "time_end" not in batch.keys():
108
+ batch["time_end"] = torch.randint(args.min_generate_audio_seconds, args.max_generate_audio_seconds, (1,)).to(torch.float64)
109
+ elif (batch["time_end"].numpy()[0] - batch["time_start"].numpy()[0]) < args.min_generate_audio_seconds:
110
+ batch["time_end"] = torch.randint(int(batch["time_start"].numpy()[0] + args.min_generate_audio_seconds), int(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds), (1,)).to(torch.float64)
111
+
112
+ if "chorus" not in batch.keys():
113
+ batch["chorus"] = torch.randint(1, 5, (1,))
114
+
115
+ if args.chorus == "random":
116
+ batch["chorus"] = torch.randint(1, 5, (1,))
117
+ elif args.chorus == "intro":
118
+ batch["chorus"] = torch.Tensor([0])
119
+ elif "verse" in args.chorus:
120
+ batch["chorus"] = torch.Tensor([1])
121
+ elif args.chorus == "chorus":
122
+ batch["chorus"] = torch.Tensor([2])
123
+ elif args.chorus == "outro":
124
+ batch["chorus"] = torch.Tensor([4])
125
+
126
+ time_start = batch["time_start"].to(device)
127
+ time_end = batch["time_end"].to(device)
128
+ chorus = batch["chorus"].to(torch.int)
129
+
130
+ text_prompt = f"<|{batch['time_start'].numpy()[0]}|><|{MUSIC_STRUCTURE_LABELS[chorus.numpy()[0]]}|><|{batch['text'][0]}|><|{batch['time_end'].numpy()[0]}|>"
131
+ chorus = chorus.to(device)
132
+
133
+ model_input = {"text": text, "audio_token": token, "audio_token_len": token_len,
134
+ "text_token": text_token, "text_token_len": text_token_len,
135
+ "embeddings": [time_start, time_end, chorus], "raw_text":text}
136
+
137
+ music_audios = []
138
+ for model_output in model.inference(**model_input):
139
+ music_audios.append(model_output['music_audio'])
140
+
141
+ music_key = utts[0]
142
+ music_fn = os.path.join(args.result_dir, '{}.wav'.format(music_key))
143
+ torchaudio.save(music_fn, music_audios[0], sample_rate=args.sample_rate)
144
+ f.write('{} {}\n'.format(music_key, music_fn))
145
+ f.flush()
146
+ f.close()
147
+ logging.info('Result wav.scp saved in {}'.format(fn))
148
+
149
+ if __name__ == '__main__':
150
+ main()
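To make the structured prompt built in the loop above concrete, a small illustration (not part of the commit); it assumes MUSIC_STRUCTURE_LABELS maps chorus index 0 to "intro", as implied by the mapping in main().

```python
# Illustrative only: layout of the text_prompt string assembled above.
time_start, time_end = 0.0, 30.0
label = "intro"                      # chorus index 0 per the mapping in main()
caption = "A serene piano piece."
text_prompt = f"<|{time_start}|><|{label}|><|{caption}|><|{time_end}|>"
print(text_prompt)                   # <|0.0|><|intro|><|A serene piano piece.|><|30.0|>
```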
inspiremusic/bin/inference.py ADDED
@@ -0,0 +1,266 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+
17
+ import argparse
18
+ import logging
19
+
20
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
21
+ import os
22
+ import torch
23
+ from torch.utils.data import DataLoader
24
+ import torchaudio
25
+ from hyperpyyaml import load_hyperpyyaml
26
+ from tqdm import tqdm
27
+ from inspiremusic.cli.model import InspireMusicModel
28
+ from inspiremusic.dataset.dataset import Dataset
29
+ import time
30
+ from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
31
+ from inspiremusic.utils.common import MUSIC_STRUCTURE_LABELS
32
+
33
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
34
+
35
+ def get_args():
36
+ parser = argparse.ArgumentParser(description='inference only with your model')
37
+ parser.add_argument('--config', required=True, help='config file')
38
+ parser.add_argument('--prompt_data', required=True, help='prompt data file')
39
+ parser.add_argument('--flow_model', default=None, required=False, help='flow model file')
40
+ parser.add_argument('--llm_model', default=None,required=False, help='flow model file')
41
+ parser.add_argument('--music_tokenizer', required=True, help='music tokenizer model file')
42
+ parser.add_argument('--wavtokenizer', required=True, help='wavtokenizer model file')
43
+ parser.add_argument('--chorus', default="random",required=False, help='chorus tag generation mode, eg. random, verse, chorus, intro.')
44
+ parser.add_argument('--fast', action='store_true', required=False, help='True: fast inference mode, without flow matching for fast inference. False: normal inference mode, with flow matching for high quality.')
45
+ parser.add_argument('--fp16', default=True, type=bool, required=False, help='inference with fp16 model')
46
+ parser.add_argument('--fade_out', default=True, type=bool, required=False, help='add fade out effect to generated audio')
47
+ parser.add_argument('--fade_out_duration', default=1.0, type=float, required=False, help='fade out duration in seconds')
48
+ parser.add_argument('--trim', default=False, type=bool, required=False, help='trim the silence ending of generated audio')
49
+ parser.add_argument('--format', type=str, default="wav", required=False,
50
+ choices=["wav", "mp3", "m4a", "flac"],
51
+ help='sampling rate of input audio')
52
+ parser.add_argument('--sample_rate', type=int, default=24000, required=False,
53
+ help='sampling rate of input audio')
54
+ parser.add_argument('--output_sample_rate', type=int, default=48000, required=False, choices=[24000, 48000],
55
+ help='sampling rate of generated output audio')
56
+ parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0, required=False,
57
+ help='the minimum generated audio length in seconds')
58
+ parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0, required=False,
59
+ help='the maximum generated audio length in seconds')
60
+ parser.add_argument('--gpu',
61
+ type=int,
62
+ default=0,
63
+ help='gpu id for this rank, -1 for cpu')
64
+ parser.add_argument('--task',
65
+ default='text-to-music',
66
+ choices=['text-to-music', 'continuation', "reconstruct", "super_resolution"],
67
+ help='choose inference task type. text-to-music: text-to-music task. continuation: music continuation task. reconstruct: reconstruction of original music. super_resolution: convert original 24kHz music into 48kHz music.')
68
+ parser.add_argument('--result_dir', required=True, help='asr result file')
69
+ args = parser.parse_args()
70
+ print(args)
71
+ return args
72
+
73
+
74
+ def main():
75
+ args = get_args()
76
+ logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s')
77
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
78
+
79
+ if args.fast:
80
+ args.output_sample_rate = 24000
81
+
82
+ min_generate_audio_length = int(args.output_sample_rate * args.min_generate_audio_seconds)
83
+ max_generate_audio_length = int(args.output_sample_rate * args.max_generate_audio_seconds)
84
+ assert args.min_generate_audio_seconds <= args.max_generate_audio_seconds
85
+
86
+ # Init inspiremusic models from configs
87
+ use_cuda = args.gpu >= 0 and torch.cuda.is_available()
88
+ device = torch.device('cuda' if use_cuda else 'cpu')
89
+ with open(args.config, 'r') as f:
90
+ configs = load_hyperpyyaml(f)
91
+
92
+ model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], args.fast, args.fp16)
93
+
94
+ model.load(args.llm_model, args.flow_model, args.music_tokenizer, args.wavtokenizer)
95
+
96
+ if args.llm_model is None:
97
+ model.llm = None
98
+ else:
99
+ model.llm = model.llm.to(torch.float32)
100
+
101
+ if args.flow_model is None:
102
+ model.flow = None
103
+
104
+ test_dataset = Dataset(args.prompt_data, data_pipeline=configs['data_pipeline'], mode='inference', shuffle=True, partition=False)
105
+ test_data_loader = DataLoader(test_dataset, batch_size=None, num_workers=0)
106
+
107
+ del configs
108
+ os.makedirs(args.result_dir, exist_ok=True)
109
+ fn = os.path.join(args.result_dir, 'wav.scp')
110
+ f = open(fn, 'w')
111
+ caption_fn = os.path.join(args.result_dir, 'captions.txt')
112
+ caption_f = open(caption_fn, 'w')
113
+
114
+ with torch.no_grad():
115
+ for _, batch in tqdm(enumerate(test_data_loader)):
116
+ utts = batch["utts"]
117
+
118
+ assert len(utts) == 1, "inference mode only support batchsize 1"
119
+ text_token = batch["text_token"].to(device)
120
+ text_token_len = batch["text_token_len"].to(device)
121
+
122
+ if "time_start" not in batch.keys():
123
+ batch["time_start"] = torch.randint(0, args.min_generate_audio_seconds, (1,)).to(torch.float64)
124
+
125
+ if batch["time_start"].numpy()[0] > 300:
126
+ batch["time_start"] = torch.Tensor([0]).to(torch.float64)
127
+
128
+ if "time_end" not in batch.keys():
129
+ batch["time_end"] = torch.randint(int(batch["time_start"].numpy()[0] + args.min_generate_audio_seconds), int(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds), (1,)).to(torch.float64)
130
+ else:
131
+ if (batch["time_end"].numpy()[0] - batch["time_start"].numpy()[0]) < args.min_generate_audio_seconds:
132
+ batch["time_end"] = torch.randint(int(batch["time_start"].numpy()[0] + args.min_generate_audio_seconds), int(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds), (1,)).to(torch.float64)
133
+ elif (batch["time_end"].numpy()[0] - batch["time_start"].numpy()[0]) > args.max_generate_audio_seconds:
134
+ batch["time_end"] = torch.Tensor([(batch["time_start"].numpy()[0] + args.max_generate_audio_seconds)]).to(torch.float64)
135
+
136
+ if "chorus" not in batch.keys():
137
+ batch["chorus"] = torch.randint(1, 5, (1,))
138
+
139
+ if args.chorus == "random":
140
+ batch["chorus"] = torch.randint(1, 5, (1,))
141
+ elif args.chorus == "intro":
142
+ batch["chorus"] = torch.Tensor([0])
143
+ elif "verse" in args.chorus:
144
+ batch["chorus"] = torch.Tensor([1])
145
+ elif args.chorus == "chorus":
146
+ batch["chorus"] = torch.Tensor([2])
147
+ elif args.chorus == "outro":
148
+ batch["chorus"] = torch.Tensor([4])
149
+ else:
150
+ batch["chorus"] = batch["chorus"]
151
+
152
+ time_start = batch["time_start"].to(device)
153
+ time_end = batch["time_end"].to(device)
154
+ chorus = batch["chorus"].to(torch.int)
155
+
156
+ text_prompt = f"<|{batch['time_start'].numpy()[0]}|><|{MUSIC_STRUCTURE_LABELS[chorus.numpy()[0]]}|><|{batch['text'][0]}|><|{batch['time_end'].numpy()[0]}|>"
157
+ chorus = chorus.to(device)
158
+
159
+ if batch["acoustic_token"] is None:
160
+ audio_token = None
161
+ audio_token_len = None
162
+ else:
163
+ audio_token = batch["acoustic_token"].to(device)
164
+ audio_token_len = batch["acoustic_token_len"].to(device)
165
+
166
+ text = batch["text"]
167
+
168
+ if "semantic_token" in batch:
169
+ token = batch["semantic_token"].to(device)
170
+ token_len = batch["semantic_token_len"].to(device)
171
+ else:
172
+ if audio_token is None:
173
+ token = None
174
+ token_len = None
175
+ else:
176
+ token = audio_token.view(audio_token.size(0), -1, 4)[:, :, 0]
177
+ token_len = audio_token_len / 4
178
+
179
+ if args.task in ['text-to-music', 'continuation']:
180
+ # text to music, music continuation
181
+ model_input = {"text": text, "audio_token": token,
182
+ "audio_token_len": token_len,
183
+ "text_token": text_token,
184
+ "text_token_len": text_token_len,
185
+ "embeddings": [time_start, time_end, chorus],
186
+ "raw_text": text,
187
+ "sample_rate": args.output_sample_rate,
188
+ "duration_to_gen": args.max_generate_audio_seconds,
189
+ "task": args.task}
190
+ elif args.task in ['reconstruct', 'super_resolution']:
191
+ # audio reconstruction, audio super resolution
192
+ model_input = {"text": text, "audio_token": audio_token,
193
+ "audio_token_len": audio_token_len,
194
+ "text_token": text_token,
195
+ "text_token_len": text_token_len,
196
+ "embeddings": [time_start, time_end, chorus],
197
+ "raw_text": text,
198
+ "sample_rate": args.output_sample_rate,
199
+ "duration_to_gen": args.max_generate_audio_seconds,
200
+ "task": args.task}
201
+ else:
202
+ # zero-shot
203
+ model_input = {'text' : text,
204
+ 'text_len' : text_token_len,
205
+ 'prompt_text' : text_token,
206
+ 'prompt_text_len' : text_token_len,
207
+ 'llm_prompt_audio_token' : token,
208
+ 'llm_prompt_audio_token_len' : token_len,
209
+ 'flow_prompt_audio_token' : audio_token,
210
+ 'flow_prompt_audio_token_len': audio_token_len,
211
+ 'prompt_audio_feat' : audio_feat,
212
+ 'prompt_audio_feat_len' : audio_feat_len,
213
+ "embeddings" : [time_start,
214
+ time_end,
215
+ chorus]}
216
+
217
+ music_key = utts[0]
218
+ music_audios = []
219
+ music_fn = os.path.join(args.result_dir, f'{music_key}.{args.format}')
220
+ bench_start = time.time()
221
+
222
+ for model_output in model.inference(**model_input):
223
+ music_audios.append(model_output['music_audio'])
224
+ bench_end = time.time()
225
+ if args.trim:
226
+ music_audio = trim_audio(music_audios[0],
227
+ sample_rate=args.output_sample_rate,
228
+ threshold=0.05,
229
+ min_silence_duration=0.8)
230
+ else:
231
+ music_audio = music_audios[0]
232
+ if music_audio.shape[0] != 0:
233
+ if music_audio.shape[1] > max_generate_audio_length:
234
+ music_audio = music_audio[:, :max_generate_audio_length]
235
+ if music_audio.shape[1] >= min_generate_audio_length:
236
+ try:
237
+ if args.fade_out:
238
+ music_audio = fade_out(music_audio, args.output_sample_rate, args.fade_out_duration)
239
+ music_audio = music_audio.repeat(2, 1)
240
+ if args.format in ["wav", "flac"]:
241
+ torchaudio.save(music_fn, music_audio, sample_rate=args.output_sample_rate, encoding="PCM_S", bits_per_sample=24)
242
+ elif args.format in ["mp3", "m4a"]:
243
+ torchaudio.backend.sox_io_backend.save(filepath=music_fn, src=music_audio, sample_rate=args.output_sample_rate, format=args.format)
244
+ else:
245
+ logging.info(f"Format is not supported. Please choose from wav, mp3, m4a, flac.")
246
+ except Exception as e:
247
+ logging.info(f"Error saving file: {e}")
248
+ raise
249
+
250
+ audio_duration = music_audio.shape[1] / args.output_sample_rate
251
+ rtf = (bench_end - bench_start) / audio_duration
252
+ logging.info(f"processing time: {int(bench_end - bench_start)}s, audio length: {int(audio_duration)}s, rtf: {rtf}, text prompt: {text_prompt}")
253
+ f.write('{} {}\n'.format(music_key, music_fn))
254
+ f.flush()
255
+ caption_f.write('{}\t{}\n'.format(music_key, text_prompt))
256
+ caption_f.flush()
257
+ else:
258
+ logging.info(f"Generate audio length {music_audio.shape[1]} is shorter than min_generate_audio_length.")
259
+ else:
260
+ logging.info(f"Generate audio is empty, dim = {music_audio.shape[0]}.")
261
+ f.close()
262
+ logging.info('Result wav.scp saved in {}'.format(fn))
263
+
264
+
265
+ if __name__ == '__main__':
266
+ main()
inspiremusic/bin/train.py ADDED
@@ -0,0 +1,194 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import print_function
16
+ import argparse
17
+ import datetime
18
+ import logging
19
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
20
+ from copy import deepcopy
21
+ import torch
22
+ import torch.distributed as dist
23
+ import deepspeed
24
+ import glob
25
+ import os
26
+ from hyperpyyaml import load_hyperpyyaml
27
+ from torch.cuda.amp import GradScaler, autocast
28
+ from torch.distributed.elastic.multiprocessing.errors import record
29
+ from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
30
+ from inspiremusic.utils.executor import Executor
31
+ from inspiremusic.utils.train_utils import (
32
+ init_distributed,
33
+ init_dataset_and_dataloader,
34
+ init_optimizer_and_scheduler,
35
+ init_summarywriter, save_model,
36
+ wrap_cuda_model, check_modify_and_save_config)
37
+
38
+
39
+ def get_args():
40
+ parser = argparse.ArgumentParser(description='training your network')
41
+ parser.add_argument('--train_engine',
42
+ default='torch_ddp',
43
+ choices=['torch_ddp', 'deepspeed'],
44
+ help='Engine for paralleled training')
45
+ parser.add_argument('--model', required=True, help='model which will be trained')
46
+ parser.add_argument('--config', required=True, help='config file')
47
+ parser.add_argument('--train_data', required=True, help='train data file')
48
+ parser.add_argument('--cv_data', required=True, help='cv data file')
49
+ parser.add_argument('--checkpoint', help='checkpoint model')
50
+ parser.add_argument('--model_dir', required=True, help='save model dir')
51
+ parser.add_argument('--tensorboard_dir',
52
+ default='tensorboard',
53
+ help='tensorboard log dir')
54
+ parser.add_argument('--ddp.dist_backend',
55
+ dest='dist_backend',
56
+ default='nccl',
57
+ choices=['nccl', 'gloo'],
58
+ help='distributed backend')
59
+ parser.add_argument('--num_workers',
60
+ default=0,
61
+ type=int,
62
+ help='number of subprocess workers for reading')
63
+ parser.add_argument('--prefetch',
64
+ default=100,
65
+ type=int,
66
+ help='prefetch number')
67
+ parser.add_argument('--pin_memory',
68
+ action='store_true',
69
+ default=True,
70
+ help='Use pinned memory buffers used for reading')
71
+ parser.add_argument('--deepspeed.save_states',
72
+ dest='save_states',
73
+ default='model_only',
74
+ choices=['model_only', 'model+optimizer'],
75
+ help='save model/optimizer states')
76
+ parser.add_argument('--timeout',
77
+ default=30,
78
+ type=int,
79
+ help='timeout (in seconds) of inspiremusic_join.')
80
+ parser.add_argument('--fp16',
81
+ action='store_true',
82
+ default=False,
83
+ help='Enable fp16 mixed precision training')
84
+ parser.add_argument('--lora',
85
+ action='store_true',
86
+ default=False,
87
+ help='Enable LoRA training')
88
+ parser.add_argument('--lora_rank',
89
+ default=4,
90
+ type=int,
91
+ help='LoRA rank')
92
+ parser.add_argument('--lora_alpha',
93
+ default=16,
94
+ type=int,
95
+ help='LoRA alpha')
96
+ parser.add_argument('--lora_dropout',
97
+ default=0.1,
98
+ type=float,
99
+ help='LoRA dropout rate')
100
+ parser.add_argument('--lora_target_modules',
101
+ nargs='+',
102
+ default=["k_proj","v_proj"],
103
+ help='Target modules to apply LoRA (e.g., ["q_proj", "v_proj"])')
104
+
105
+ parser = deepspeed.add_config_arguments(parser)
106
+ args = parser.parse_args()
107
+ return args
108
+
109
+
110
+ @record
111
+ def main():
112
+ args = get_args()
113
+ logging.basicConfig(level=logging.DEBUG,
114
+ format='%(asctime)s %(levelname)s %(message)s')
115
+
116
+ override_dict = {k: None for k in ['llm', 'flow', 'hift'] if k != args.model}
117
+ with open(args.config, 'r') as f:
118
+ configs = load_hyperpyyaml(f, overrides=override_dict)
119
+ configs['train_conf'].update(vars(args))
120
+
121
+ # Init env for ddp
122
+ init_distributed(args)
123
+
124
+ # Get dataset & dataloader
125
+ train_dataset, cv_dataset, train_data_loader, cv_data_loader = \
126
+ init_dataset_and_dataloader(args, configs)
127
+
128
+ # Do some sanity checks and save config to arsg.model_dir
129
+ configs = check_modify_and_save_config(args, configs)
130
+
131
+ # Tensorboard summary
132
+ writer = init_summarywriter(args)
133
+
134
+ # load checkpoint
135
+ model = configs[args.model]
136
+
137
+ if args.checkpoint is not None:
138
+ model.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
139
+ else:
140
+ # Find and load the latest checkpoint
141
+ checkpoint_files = glob.glob(os.path.join(args.model_dir, '*.pt'))
142
+
143
+ if checkpoint_files:
144
+ latest_checkpoint = max(checkpoint_files, key=os.path.getctime)
145
+ logging.info(f"Loaded latest checkpoint from {latest_checkpoint}")
146
+
147
+ model.load_state_dict(torch.load(latest_checkpoint, map_location='cpu'))
148
+
149
+ if args.lora:
150
+ logging.info("Applying LoRA to the model...")
151
+ if not args.lora_target_modules:
152
+ raise ValueError("No target modules specified for LoRA. Please provide --lora_target_modules.")
153
+ lora_config = LoraConfig(
154
+ task_type="CAUSAL_LM", # Change to appropriate task type
155
+ inference_mode=False,
156
+ r=args.lora_rank,
157
+ lora_alpha=args.lora_alpha,
158
+ lora_dropout=args.lora_dropout,
159
+ target_modules=args.lora_target_modules
160
+ )
161
+ model.llm.model = get_peft_model(model.llm.model, lora_config)
162
+ # Optionally freeze the base model
163
+ else:
164
+ logging.info("LoRA is not enabled. Training the full model.")
165
+
166
+ # Dispatch model from cpu to gpu
167
+ model = wrap_cuda_model(args, model)
168
+
169
+ # Get optimizer & scheduler
170
+ model, optimizer, scheduler = init_optimizer_and_scheduler(args, configs, model)
171
+
172
+ # Initialize AMP for torch_ddp if fp16 is enabled
173
+ scaler = None
174
+ if args.fp16:
175
+ scaler = GradScaler()
176
+ logging.info("Initialized AMP GradScaler for mixed precision training.")
177
+
178
+ # Save init checkpoints
179
+ info_dict = deepcopy(configs['train_conf'])
180
+
181
+ # Get executor
182
+ executor = Executor()
183
+
184
+ # Start training loop
185
+ for epoch in range(info_dict['max_epoch']):
186
+ executor.epoch = epoch
187
+ train_dataset.set_epoch(epoch)
188
+ dist.barrier()
189
+ group_join = dist.new_group(backend="gloo", timeout=datetime.timedelta(seconds=args.timeout))
190
+ executor.train_one_epoch(model, optimizer, scheduler, train_data_loader, cv_data_loader, writer, info_dict, group_join, scaler=scaler)
191
+ dist.destroy_process_group(group_join)
192
+
193
+ if __name__ == '__main__':
194
+ main()
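The parser above maps dotted flag names such as --ddp.dist_backend and --deepspeed.save_states onto plain attribute names via dest=. A minimal, runnable sketch of that argparse pattern in isolation (only the two flags shown above, nothing repository-specific):

import argparse

# Dotted CLI flags mapped to plain attribute names via dest=,
# mirroring the --ddp.dist_backend / --deepspeed.save_states flags above.
parser = argparse.ArgumentParser()
parser.add_argument('--ddp.dist_backend', dest='dist_backend',
                    default='nccl', choices=['nccl', 'gloo'])
parser.add_argument('--deepspeed.save_states', dest='save_states',
                    default='model_only', choices=['model_only', 'model+optimizer'])

args = parser.parse_args(['--ddp.dist_backend', 'gloo'])
print(args.dist_backend)   # -> gloo
print(args.save_states)    # -> model_only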
inspiremusic/cli/__init__.py ADDED
File without changes
inspiremusic/cli/frontend.py ADDED
@@ -0,0 +1,106 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from functools import partial
15
+ import onnxruntime
16
+ import torch
17
+ import numpy as np
18
+ import whisper
19
+ from typing import Callable
20
+ import torchaudio.compliance.kaldi as kaldi
21
+ import torchaudio
22
+ import os
23
+ import re
24
+ import inflect
25
+ from inspiremusic.cli.model import InspireMusicModel
26
+ from inspiremusic.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph
27
+ from inspiremusic.wavtokenizer.decoder.pretrained import WavTokenizer
28
+
29
+ class InspireMusicFrontEnd:
30
+ def __init__(self,
31
+ configs: Callable,
32
+ get_tokenizer: Callable,
33
+ llm_model: str,
34
+ flow_model: str,
35
+ music_tokenizer_dir: str,
36
+ audio_tokenizer_dir: str,
37
+ instruct: bool = False,
38
+ fast: bool = False,
39
+ fp16: bool = True,
40
+ allowed_special: str = 'all'):
41
+ self.tokenizer = get_tokenizer()
42
+ self.audio_tokenizer_dir = audio_tokenizer_dir
43
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
44
+
45
+ self.bandwidth_id = torch.tensor([0]).to(self.device)
46
+ self.wavtokenizer = WavTokenizer.from_pretrained_feat(f"{audio_tokenizer_dir}/config.yaml", f"{audio_tokenizer_dir}/model.pt").to(self.device)
47
+
48
+ self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], fast, fp16)
49
+ self.model = self.model.load(llm_model, flow_model, music_tokenizer_dir, audio_tokenizer_dir)
50
+
51
+ self.instruct = instruct
52
+ self.allowed_special = allowed_special
53
+ self.inflect_parser = inflect.engine()
54
+
55
+ def _extract_text_token(self, text):
56
+ text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
57
+ text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
58
+ text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32).to(self.device)
59
+ return text_token, text_token_len
60
+
61
+ def _extract_audio_token(self, audio, sample_rate=24000):
62
+ audio = torch.tensor(audio, dtype=torch.float32, device=self.device)
63
+ _, audio_token = self.wavtokenizer.encode_infer(audio, bandwidth_id=self.bandwidth_id)
64
+ audio_token = audio_token.squeeze(0)
65
+ audio_token_len = torch.tensor([audio_token.shape[1]], dtype=torch.int32, device=self.device)
66
+ return audio_token, audio_token_len
67
+
68
+ def text_normalize(self, text, split=True):
69
+ text = text.strip()
70
+ if contains_chinese(text):
71
+ text = text.replace("\n", "")
72
+ text = replace_blank(text)
73
+ text = replace_corner_mark(text)
74
+ text = text.replace(".", "、")
75
+ text = text.replace(" - ", ",")
76
+ text = remove_bracket(text)
77
+ text = re.sub(r'[,,]+$', '。', text)
78
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "zh", token_max_n=80,
79
+ token_min_n=60, merge_len=20, comma_split=False))
80
+ else:
81
+ text = spell_out_number(text, self.inflect_parser)
82
+ texts = list(split_paragraph(text, partial(self.tokenizer.encode, allowed_special=self.allowed_special), "en", token_max_n=80,
83
+ token_min_n=60, merge_len=20, comma_split=False))
84
+ if split is False:
85
+ return text
86
+ return texts
87
+
88
+ def frontend_text_to_music(self, text, time_start, time_end, chorus):
89
+ text_token, text_token_len = self._extract_text_token(text)
90
+ model_input = {"text": text, "audio_token": None, "audio_token_len": None,
91
+ "text_token": text_token, "text_token_len": text_token_len,
92
+ "embeddings": [time_start, time_end, chorus], "raw_text":text}
93
+ return model_input
94
+
95
+ def frontend_continuation(self, text, audio, time_start, time_end, chorus, target_sr=24000):
96
+ if text is None:
97
+ text_token = None
98
+ text_token_len = None
99
+ else:
100
+ text_token, text_token_len = self._extract_text_token(text)
101
+ audio_token, audio_token_len = self._extract_audio_token(audio, target_sr)
102
+ model_input = {"text": text, "audio_token": audio_token, "audio_token_len": audio_token_len,
103
+ "text_token": text_token, "text_token_len": text_token_len,
104
+ "embeddings": [time_start, time_end, chorus], "raw_text":text}
105
+ return model_input
106
+
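A hedged sketch of the model_input dictionary that frontend_text_to_music builds; the tokenizer below is a hypothetical whitespace stand-in, not the BPE tokenizer the frontend actually loads, and the embeddings entry simply mirrors the [time_start, time_end, chorus] list passed by the caller.

import torch

# Hypothetical stand-in tokenizer, used only to show tensor shapes.
def toy_encode(text):
    return [hash(w) % 1000 for w in text.split()]

text = "uplifting electronic track with a driving beat"
text_token = torch.tensor([toy_encode(text)], dtype=torch.int32)
text_token_len = torch.tensor([text_token.shape[1]], dtype=torch.int32)

model_input = {"text": text,
               "audio_token": None, "audio_token_len": None,
               "text_token": text_token, "text_token_len": text_token_len,
               "embeddings": [0.0, 30.0, "verse"],  # [time_start, time_end, chorus]
               "raw_text": text}
print(model_input["text_token_len"])  # tensor([7], dtype=torch.int32)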
inspiremusic/cli/inference.py ADDED
@@ -0,0 +1,296 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import sys
17
+ import torchaudio
18
+ import time
19
+ import logging
20
+ import argparse
21
+
22
+ from modelscope import snapshot_download
23
+ from inspiremusic.cli.inspiremusic import InspireMusic
24
+ from inspiremusic.utils.file_utils import logging
25
+ import torch
26
+ from inspiremusic.utils.audio_utils import trim_audio, fade_out, process_audio
27
+ from transformers import AutoModel
28
+
29
+ def set_env_variables():
30
+ os.environ['PYTHONIOENCODING'] = 'UTF-8'
31
+ os.environ['TOKENIZERS_PARALLELISM'] = 'False'
32
+ current_working_dir = os.getcwd()
33
+ main_root = os.path.realpath(os.path.join(current_working_dir, '../../'))
34
+ bin_dir = os.path.join(main_root, 'inspiremusic')
35
+ third_party_matcha_tts_path = os.path.join(main_root, 'third_party', 'Matcha-TTS')
36
+ python_path = f"{main_root}:{bin_dir}:{third_party_matcha_tts_path}:{os.environ.get('PYTHONPATH', '')}"
37
+ os.environ['PYTHONPATH'] = python_path
38
+ sys.path.extend([main_root, third_party_matcha_tts_path])
39
+
40
+ class InspireMusicUnified:
41
+ def __init__(self,
42
+ model_name: str = "InspireMusic-1.5B-Long",
43
+ model_dir: str = None,
44
+ min_generate_audio_seconds: float = 10.0,
45
+ max_generate_audio_seconds: float = 30.0,
46
+ sample_rate: int = 24000,
47
+ output_sample_rate: int = 48000,
48
+ load_jit: bool = True,
49
+ load_onnx: bool = False,
50
+ fast: bool = False,
51
+ fp16: bool = True,
52
+ gpu: int = 0,
53
+ result_dir: str = None):
54
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
55
+
56
+ # Set model_dir or default to downloading if it doesn't exist
57
+ self.model_dir = model_dir or f"../../pretrained_models/{model_name}"
58
+ if not os.path.exists(self.model_dir):
59
+ self.model_dir = snapshot_download(f"iic/{model_name}", cache_dir=self.model_dir)
60
+
61
+ self.sample_rate = sample_rate
62
+ self.output_sample_rate = 24000 if fast else output_sample_rate
63
+ self.result_dir = result_dir or f"exp/{model_name}"
64
+ os.makedirs(self.result_dir, exist_ok=True)
65
+
66
+ self.min_generate_audio_seconds = min_generate_audio_seconds
67
+ self.max_generate_audio_seconds = max_generate_audio_seconds
68
+ self.min_generate_audio_length = int(self.output_sample_rate * self.min_generate_audio_seconds)
69
+ self.max_generate_audio_length = int(self.output_sample_rate * self.max_generate_audio_seconds)
70
+ assert self.min_generate_audio_seconds <= self.max_generate_audio_seconds, "Min audio seconds must be less than or equal to max audio seconds"
71
+
72
+ use_cuda = gpu >= 0 and torch.cuda.is_available()
73
+ self.device = torch.device('cuda' if use_cuda else 'cpu')
74
+ self.model = InspireMusic(self.model_dir, load_jit=load_jit, load_onnx=load_onnx, fast=fast, fp16=fp16)
75
+
76
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
77
+
78
+ @torch.inference_mode()
79
+ def inference(self,
80
+ task: str = 'text-to-music',
81
+ text: str = None,
82
+ audio_prompt: str = None, # audio prompt file path
83
+ chorus: str = "verse",
84
+ time_start: float = 0.0,
85
+ time_end: float = 30.0,
86
+ output_fn: str = "output_audio",
87
+ max_audio_prompt_length: float = 5.0,
88
+ fade_out_duration: float = 1.0,
89
+ output_format: str = "wav",
90
+ fade_out_mode: bool = True,
91
+ trim: bool = False,
92
+ ):
93
+
94
+ with torch.no_grad():
95
+ text_prompt = f"<|{time_start}|><|{chorus}|><|{text}|><|{time_end}|>"
96
+ chorus_dict = {"random": torch.randint(1, 5, (1,)).item(), "intro" : 0, "verse": 1, "chorus": 2, "outro": 4}
97
+ chorus = chorus_dict.get(chorus, 1)
98
+ chorus = torch.tensor([chorus], dtype=torch.int).to(self.device)
99
+
100
+ time_start_tensor = torch.tensor([time_start], dtype=torch.float64).to(self.device)
101
+ time_end_tensor = torch.tensor([time_end], dtype=torch.float64).to(self.device)
102
+
103
+ music_fn = os.path.join(self.result_dir, f'{output_fn}.{output_format}')
104
+
105
+ bench_start = time.time()
106
+
107
+ if task == 'text-to-music':
108
+ model_input = {
109
+ "text" : text,
110
+ "audio_prompt" : audio_prompt,
111
+ "time_start" : time_start_tensor,
112
+ "time_end" : time_end_tensor,
113
+ "chorus" : chorus,
114
+ "task" : task,
115
+ "stream" : False,
116
+ "duration_to_gen": self.max_generate_audio_seconds,
117
+ "sr" : self.sample_rate
118
+ }
119
+ elif task == 'continuation':
120
+ if audio_prompt is not None:
121
+ audio, _ = process_audio(audio_prompt, self.sample_rate)
122
+ if audio.size(1) < self.sample_rate:
123
+ logging.warning("Warning: Input prompt audio length is shorter than 1s. Please provide an appropriate length audio prompt and try again.")
124
+ audio = None
125
+ else:
126
+ max_audio_prompt_length_samples = int(max_audio_prompt_length * self.sample_rate)
127
+ audio = audio[:, :max_audio_prompt_length_samples] # Trimming prompt audio
128
+
129
+ model_input = {
130
+ "text" : text,
131
+ "audio_prompt" : audio,
132
+ "time_start" : time_start_tensor,
133
+ "time_end" : time_end_tensor,
134
+ "chorus" : chorus,
135
+ "task" : task,
136
+ "stream" : False,
137
+ "duration_to_gen": self.max_generate_audio_seconds,
138
+ "sr" : self.sample_rate
139
+ }
140
+
141
+ music_audios = []
142
+ for model_output in self.model.cli_inference(**model_input):
143
+ music_audios.append(model_output['music_audio'])
144
+
145
+ bench_end = time.time()
146
+
147
+ if trim:
148
+ music_audio = trim_audio(music_audios[0],
149
+ sample_rate=self.output_sample_rate,
150
+ threshold=0.05,
151
+ min_silence_duration=0.8)
152
+ else:
153
+ music_audio = music_audios[0]
154
+
155
+ if music_audio.shape[0] != 0:
156
+ if music_audio.shape[1] > self.max_generate_audio_length:
157
+ music_audio = music_audio[:, :self.max_generate_audio_length]
158
+
159
+ if music_audio.shape[1] >= self.min_generate_audio_length:
160
+ try:
161
+ if fade_out_mode:
162
+ music_audio = fade_out(music_audio, self.output_sample_rate, fade_out_duration)
163
+
164
+ music_audio = music_audio.repeat(2, 1)
165
+
166
+ if output_format in ["wav", "flac"]:
167
+ torchaudio.save(music_fn, music_audio,
168
+ sample_rate=self.output_sample_rate,
169
+ encoding="PCM_S",
170
+ bits_per_sample=24)
171
+ elif output_format in ["mp3", "m4a"]:
172
+ torchaudio.backend.sox_io_backend.save(
173
+ filepath=music_fn, src=music_audio,
174
+ sample_rate=self.output_sample_rate,
175
+ format=output_format)
176
+ else:
177
+ logging.info("Format is not supported. Please choose from wav, mp3, m4a, flac.")
178
+
179
+ except Exception as e:
180
+ logging.error(f"Error saving file: {e}")
181
+ raise
182
+
183
+ audio_duration = music_audio.shape[1] / self.output_sample_rate
184
+ rtf = (bench_end - bench_start) / audio_duration
185
+ logging.info(f"Processing time: {int(bench_end - bench_start)}s, audio length: {int(audio_duration)}s, rtf: {rtf}, text prompt: {text_prompt}")
186
+
187
+ else:
188
+ logging.error(f"Generated audio length is shorter than minimum required audio length.")
189
+
190
+ def get_args():
191
+ parser = argparse.ArgumentParser(description='Run inference with your model')
192
+ parser.add_argument('-m', '--model_name', default="InspireMusic-1.5B-Long",
193
+ help='Model name')
194
+
195
+ parser.add_argument('-d', '--model_dir',
196
+ help='Model folder path')
197
+
198
+ parser.add_argument('-t', '--text', default="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.",
199
+ help='Prompt text')
200
+
201
+ parser.add_argument('-a', '--audio_prompt', default=None,
202
+ help='Prompt audio')
203
+
204
+ parser.add_argument('-c', '--chorus', default="intro",
205
+ help='Chorus tag generation mode (e.g., random, verse, chorus, intro, outro)')
206
+
207
+ parser.add_argument('-f', '--fast', type=bool, default=False,
208
+ help='Enable fast inference mode (without flow matching)')
209
+
210
+ parser.add_argument('-g', '--gpu', type=int, default=0,
211
+ help='GPU ID for this rank, -1 for CPU')
212
+
213
+ parser.add_argument('--task', default='text-to-music', choices=['text-to-music', 'continuation', 'reconstruct', 'super_resolution'],
214
+ help='Inference task type: text-to-music, continuation, reconstruct, super_resolution')
215
+
216
+ parser.add_argument('-r', '--result_dir', default="exp/inspiremusic",
217
+ help='Directory to save generated audio')
218
+
219
+ parser.add_argument('-o', '--output_fn', default="output_audio",
220
+ help='Output file name')
221
+
222
+ parser.add_argument('--format', type=str, default="wav", choices=["wav", "mp3", "m4a", "flac"],
223
+ help='Format of output audio')
224
+
225
+ parser.add_argument('--sample_rate', type=int, default=24000,
226
+ help='Sampling rate of input audio')
227
+
228
+ parser.add_argument('--output_sample_rate', type=int, default=48000, choices=[24000, 48000],
229
+ help='Sampling rate of generated output audio')
230
+
231
+ parser.add_argument('-s', '--time_start', type=float, default=0.0,
232
+ help='Start time in seconds')
233
+
234
+ parser.add_argument('-e', '--time_end', type=float, default=30.0,
235
+ help='End time in seconds')
236
+
237
+ parser.add_argument('--max_audio_prompt_length', type=float, default=5.0,
238
+ help='Maximum audio prompt length in seconds')
239
+
240
+ parser.add_argument('--min_generate_audio_seconds', type=float, default=10.0,
241
+ help='Minimum generated audio length in seconds')
242
+
243
+ parser.add_argument('--max_generate_audio_seconds', type=float, default=30.0,
244
+ help='Maximum generated audio length in seconds')
245
+
246
+ parser.add_argument('--fp16', type=bool, default=True,
247
+ help='Inference with fp16 model')
248
+
249
+ parser.add_argument('--fade_out', type=bool, default=True,
250
+ help='Apply fade out effect to generated audio')
251
+
252
+ parser.add_argument('--fade_out_duration', type=float, default=1.0,
253
+ help='Fade out duration in seconds')
254
+
255
+ parser.add_argument('--trim', type=bool, default=False,
256
+ help='Trim the silence ending of generated audio')
257
+
258
+ args = parser.parse_args()
259
+
260
+ if not args.model_dir:
261
+ args.model_dir = os.path.join("../../pretrained_models", args.model_name)
262
+
263
+ print(args)
264
+ return args
265
+
266
+ def main():
267
+ set_env_variables()
268
+ args = get_args()
269
+ model = InspireMusicUnified(model_name = args.model_name,
270
+ model_dir = args.model_dir,
271
+ min_generate_audio_seconds = args.min_generate_audio_seconds,
272
+ max_generate_audio_seconds = args.max_generate_audio_seconds,
273
+ sample_rate = args.sample_rate,
274
+ output_sample_rate = args.output_sample_rate,
275
+ load_jit = True,
276
+ load_onnx = False,
277
+ fast = args.fast,
278
+ fp16 = args.fp16,
279
+ gpu = args.gpu,
280
+ result_dir = args.result_dir)
281
+
282
+ model.inference(task = args.task,
283
+ text = args.text,
284
+ audio_prompt = args.audio_prompt,
285
+ chorus = args.chorus,
286
+ time_start = args.time_start,
287
+ time_end = args.time_end,
288
+ output_fn = args.output_fn,
289
+ max_audio_prompt_length = args.max_audio_prompt_length,
290
+ fade_out_duration = args.fade_out_duration,
291
+ output_format = args.format,
292
+ fade_out_mode = args.fade_out,
293
+ trim = args.trim)
294
+
295
+ if __name__ == "__main__":
296
+ main()
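The log line above reports an rtf (real-time factor): wall-clock processing time divided by the duration of the generated audio. A minimal sketch of that bookkeeping, assuming a waveform tensor shaped (channels, samples):

import torch

# rtf = processing time / generated audio duration, as logged above.
def compute_rtf(bench_start: float, bench_end: float,
                audio: torch.Tensor, sample_rate: int) -> float:
    audio_duration = audio.shape[1] / sample_rate   # audio is (channels, samples)
    return (bench_end - bench_start) / audio_duration

audio = torch.zeros(2, 48000 * 30)           # 30 s of stereo silence at 48 kHz
print(compute_rtf(0.0, 12.0, audio, 48000))  # -> 0.4, i.e. faster than real time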
inspiremusic/cli/inspiremusic.py ADDED
@@ -0,0 +1,133 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import time
16
+ from tqdm import tqdm
17
+ from hyperpyyaml import load_hyperpyyaml
18
+ from modelscope import snapshot_download
19
+ from inspiremusic.cli.frontend import InspireMusicFrontEnd
20
+ from inspiremusic.cli.model import InspireMusicModel
21
+ from inspiremusic.utils.file_utils import logging
22
+ import torch
23
+
24
+ class InspireMusic:
25
+ def __init__(self, model_dir, load_jit=True, load_onnx=False, fast = False, fp16=True):
26
+ instruct = True if '-Instruct' in model_dir else False
27
+ self.model_dir = model_dir
28
+ if not os.path.exists(model_dir):
29
+ model_dir = snapshot_download(model_dir)
30
+ with open('{}/inspiremusic.yaml'.format(model_dir), 'r') as f:
31
+ configs = load_hyperpyyaml(f)
32
+
33
+ self.frontend = InspireMusicFrontEnd(configs,
34
+ configs['get_tokenizer'],
35
+ '{}/llm.pt'.format(model_dir),
36
+ '{}/flow.pt'.format(model_dir),
37
+ '{}/music_tokenizer/'.format(model_dir),
38
+ '{}/wavtokenizer/'.format(model_dir),
39
+ instruct,
40
+ fast,
41
+ fp16,
42
+ configs['allowed_special'])
43
+
44
+ self.model = InspireMusicModel(configs['llm'], configs['flow'], configs['hift'], configs['wavtokenizer'], fast, fp16)
45
+ self.model.load('{}/llm.pt'.format(model_dir),
46
+ '{}/flow.pt'.format(model_dir),
47
+ '{}/music_tokenizer/'.format(model_dir),
48
+ '{}/wavtokenizer/model.pt'.format(model_dir))
49
+ del configs
50
+
51
+ @torch.inference_mode()
52
+ def inference(self, task, text, audio, time_start, time_end, chorus, stream=False, sr=24000):
53
+ if task == "text-to-music":
54
+ for i in tqdm(self.frontend.text_normalize(text, split=True)):
55
+ model_input = self.frontend.frontend_text_to_music(i, time_start, time_end, chorus)
56
+ start_time = time.time()
57
+ logging.info('prompt text {}'.format(i))
58
+ for model_output in self.model.inference(**model_input, stream=stream):
59
+ music_audios_len = model_output['music_audio'].shape[1] / sr
60
+ logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
61
+ yield model_output
62
+ start_time = time.time()
63
+
64
+ elif task == "continuation":
65
+ if text is None:
66
+ if audio is not None:
67
+ for i in tqdm(audio):
68
+ model_input = self.frontend.frontend_continuation(None, i, time_start, time_end, chorus, sr)
69
+ start_time = time.time()
70
+ logging.info('prompt text {}'.format(i))
71
+ for model_output in self.model.continuation_inference(**model_input, stream=stream):
72
+ music_audios_len = model_output['music_audio'].shape[1] / sr
73
+ logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
74
+ yield model_output
75
+ start_time = time.time()
76
+ else:
77
+ if audio is not None:
78
+ for i in tqdm(self.frontend.text_normalize(text, split=True)):
79
+ model_input = self.frontend.frontend_continuation(i, audio, time_start, time_end, chorus, sr)
80
+ start_time = time.time()
81
+ logging.info('prompt text {}'.format(i))
82
+ for model_output in self.model.continuation_inference(**model_input, stream=stream):
83
+ music_audios_len = model_output['music_audio'].shape[1] / sr
84
+ logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
85
+ yield model_output
86
+ start_time = time.time()
87
+ else:
88
+ print("Please input text or audio.")
89
+ else:
90
+ print("Currently only support text-to-music and music continuation tasks.")
91
+
92
+ @torch.inference_mode()
93
+ def cli_inference(self, text, audio_prompt, time_start, time_end, chorus, task, stream=False, duration_to_gen=30, sr=24000):
94
+ if task == "text-to-music":
95
+ model_input = self.frontend.frontend_text_to_music(text, time_start, time_end, chorus)
96
+ logging.info('prompt text {}'.format(text))
97
+ elif task == "continuation":
98
+ model_input = self.frontend.frontend_continuation(text, audio_prompt, time_start, time_end, chorus, sr)
99
+ logging.info('prompt audio length: {}'.format(len(audio_prompt)))
100
+
101
+ start_time = time.time()
102
+ for model_output in self.model.inference(**model_input, duration_to_gen=duration_to_gen, task=task):
103
+ music_audios_len = model_output['music_audio'].shape[1] / sr
104
+ logging.info('yield music len {}, rtf {}'.format(music_audios_len, (time.time() - start_time) / music_audios_len))
105
+ yield model_output
106
+ start_time = time.time()
107
+
108
+ @torch.inference_mode()
109
+ def inference_zero_shot(self, text, prompt_text, prompt_audio_16k, stream=False, sr=24000):
110
+ prompt_text = self.frontend.text_normalize(prompt_text, split=False)
111
+ for i in tqdm(self.frontend.text_normalize(text, split=True)):
112
+ model_input = self.frontend.frontend_zero_shot(i, prompt_text, prompt_audio_16k)
113
+ start_time = time.time()
114
+ logging.info('prompt text {}'.format(i))
115
+ for model_output in self.model.inference(**model_input, stream=stream):
116
+ audio_len = model_output['music_audio'].shape[1] / sr
117
+ logging.info('yield audio len {}, rtf {}'.format(audio_len, (time.time() - start_time) / audio_len))
118
+ yield model_output
119
+ start_time = time.time()
120
+ @torch.inference_mode()
121
+ def inference_instruct(self, text, spk_id, instruct_text, stream=False, sr=24000):
122
+ if self.frontend.instruct is False:
123
+ raise ValueError('{} do not support instruct inference'.format(self.model_dir))
124
+ instruct_text = self.frontend.text_normalize(instruct_text, split=False)
125
+ for i in tqdm(self.frontend.text_normalize(text, split=True)):
126
+ model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
127
+ start_time = time.time()
128
+ logging.info('prompt text {}'.format(i))
129
+ for model_output in self.model.inference(**model_input, stream=stream):
130
+ audio_len = model_output['music_audio'].shape[1] / sr
131
+ logging.info('yield audio len {}, rtf {}'.format(audio_len, (time.time() - start_time) / audio_len))
132
+ yield model_output
133
+ start_time = time.time()
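The inference methods above are generators that yield a dictionary with a 'music_audio' tensor per chunk. A hedged usage sketch of consuming such a generator and concatenating the chunks along time; the generator here is a stand-in, not the real model:

import torch

# Stand-in for model.cli_inference(...) / model.inference(...): yields
# {'music_audio': tensor} chunks shaped (channels, samples).
def fake_inference():
    for _ in range(3):
        yield {'music_audio': torch.randn(1, 24000)}

chunks = [out['music_audio'] for out in fake_inference()]
music_audio = torch.cat(chunks, dim=1)   # (1, 72000) -> 3 s at 24 kHz
print(music_audio.shape)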
inspiremusic/cli/model.py ADDED
@@ -0,0 +1,297 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import numpy as np
16
+ import threading
17
+ import time
18
+ from contextlib import nullcontext
19
+ import uuid
20
+ from inspiremusic.utils.common import fade_in_out
21
+ from inspiremusic.music_tokenizer.vqvae import VQVAE
22
+ from inspiremusic.wavtokenizer.decoder.pretrained import WavTokenizer
23
+ from torch.cuda.amp import autocast
24
+ import logging
25
+ import torch
26
+ import os
27
+
28
+
29
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
30
+
31
+ class InspireMusicModel:
32
+
33
+ def __init__(self,
34
+ llm: torch.nn.Module,
35
+ flow: torch.nn.Module,
36
+ music_tokenizer: torch.nn.Module,
37
+ wavtokenizer: torch.nn.Module,
38
+ fast: bool = False,
39
+ fp16: bool = True,
40
+ ):
41
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
42
+ self.llm = llm
43
+ self.flow = flow
44
+ self.music_tokenizer = music_tokenizer
45
+ self.wavtokenizer = wavtokenizer
46
+ self.fp16 = fp16
47
+ self.token_min_hop_len = 100
48
+ self.token_max_hop_len = 200
49
+ self.token_overlap_len = 20
50
+ # mel fade in out
51
+ self.mel_overlap_len = 34
52
+ self.mel_window = np.hamming(2 * self.mel_overlap_len)
53
+ # hift cache
54
+ self.mel_cache_len = 20
55
+ self.source_cache_len = int(self.mel_cache_len * 256)
56
+ # rtf and decoding related
57
+ self.stream_scale_factor = 1
58
+ assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than or equal to 1, change it according to your actual rtf'
59
+ self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext()
60
+ self.lock = threading.Lock()
61
+ # dict used to store session related variable
62
+ self.music_token_dict = {}
63
+ self.llm_end_dict = {}
64
+ self.mel_overlap_dict = {}
65
+ self.fast = fast
66
+ self.generator = "hifi"
67
+
68
+ def load(self, llm_model, flow_model, hift_model, wavtokenizer_model):
69
+ if llm_model is not None:
70
+ self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
71
+ self.llm.to(self.device).eval()
72
+ else:
73
+ self.llm = None
74
+ if flow_model is not None:
75
+ self.flow.load_state_dict(torch.load(flow_model, map_location=self.device))
76
+ self.flow.to(self.device).eval()
77
+ if hift_model is not None:
78
+ if ".pt" not in hift_model:
79
+ self.music_tokenizer = VQVAE( hift_model + '/config.json',
80
+ hift_model + '/model.pt', with_encoder=True)
81
+ else:
82
+ self.music_tokenizer = VQVAE(os.path.dirname(hift_model) + '/config.json',
83
+ hift_model, with_encoder=True)
84
+ self.music_tokenizer.to(self.device).eval()
85
+ if wavtokenizer_model is not None:
86
+ if ".pt" not in wavtokenizer_model:
87
+ self.wavtokenizer = WavTokenizer.from_pretrained_feat( wavtokenizer_model + '/config.yaml',
88
+ wavtokenizer_model + '/model.pt')
89
+ else:
90
+ self.wavtokenizer = WavTokenizer.from_pretrained_feat( os.path.dirname(wavtokenizer_model) + '/config.yaml',
91
+ wavtokenizer_model )
92
+ self.wavtokenizer.to(self.device)
93
+
94
+ def load_jit(self, llm_text_encoder_model, llm_llm_model, flow_encoder_model):
95
+ assert self.fp16 is True, "we only provide fp16 jit model, set fp16=True if you want to use jit model"
96
+ llm_text_encoder = torch.jit.load(llm_text_encoder_model, map_location=self.device)
97
+ self.llm.text_encoder = llm_text_encoder
98
+ llm_llm = torch.jit.load(llm_llm_model)
99
+ self.llm.llm = llm_llm
100
+ flow_encoder = torch.jit.load(flow_encoder_model)
101
+ self.flow.encoder = flow_encoder
102
+
103
+ def load_onnx(self, flow_decoder_estimator_model):
104
+ import onnxruntime
105
+ option = onnxruntime.SessionOptions()
106
+ option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
107
+ option.intra_op_num_threads = 1
108
+ providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
109
+ del self.flow.decoder.estimator
110
+ self.flow.decoder.estimator = onnxruntime.InferenceSession(flow_decoder_estimator_model, sess_options=option, providers=providers)
111
+
112
+ def llm_job(self, text, audio_token, audio_token_len, prompt_text, llm_prompt_audio_token, embeddings, uuid, duration_to_gen, task):
113
+ with self.llm_context:
114
+ local_res = []
115
+ with autocast(enabled=self.fp16):
116
+ inference_kwargs = {
117
+ 'text': text.to(self.device),
118
+ 'text_len': torch.tensor([text.shape[1]], dtype=torch.int32).to(self.device),
119
+ 'prompt_text': prompt_text.to(self.device),
120
+ 'prompt_text_len': torch.tensor([prompt_text.shape[1]], dtype=torch.int32).to(self.device),
121
+ 'prompt_audio_token': llm_prompt_audio_token.to(self.device),
122
+ 'prompt_audio_token_len': torch.tensor([llm_prompt_audio_token.shape[1]], dtype=torch.int32).to(self.device),
123
+ 'embeddings': embeddings,
124
+ 'duration_to_gen': duration_to_gen,
125
+ 'task': task
126
+ }
127
+
128
+ if audio_token is not None:
129
+ inference_kwargs['audio_token'] = audio_token.to(self.device)
130
+ else:
131
+ inference_kwargs['audio_token'] = torch.Tensor([0]).to(self.device)
132
+
133
+ if audio_token_len is not None:
134
+ inference_kwargs['audio_token_len'] = audio_token_len.to(self.device)
135
+ else:
136
+ inference_kwargs['audio_token_len'] = torch.Tensor([0]).to(self.device)
137
+
138
+ for i in self.llm.inference(**inference_kwargs):
139
+ local_res.append(i)
140
+
141
+ self.music_token_dict[uuid] = local_res
142
+ self.llm_end_dict[uuid] = True
143
+
144
+ # def token2wav(self, token, token_len, text, text_len, uuid, sample_rate, finalize=False):
145
+ def token2wav(self, token, token_len, uuid, sample_rate, finalize=False, flow_cfg=None):
146
+ # if self.flow is not None:
147
+ # if isinstance(self.flow,MaskedDiffWithText):
148
+ # codec_embed = self.flow.inference(token=token.to(self.device),
149
+ # token_len=token_len.to(self.device),
150
+ # text_token=text,
151
+ # text_token_len=text_len,
152
+ # )
153
+ # else:
154
+ if flow_cfg is not None:
155
+ codec_embed = self.flow.inference_cfg(token=token.to(self.device),
156
+ token_len=token_len.to(self.device),
157
+ sample_rate=sample_rate
158
+ )
159
+ else:
160
+ codec_embed = self.flow.inference(token=token.to(self.device),
161
+ token_len=token_len.to(self.device),
162
+ sample_rate=sample_rate
163
+ )
164
+ # use music_tokenizer decoder
165
+ wav = self.music_tokenizer.generator(codec_embed)
166
+ wav = wav.squeeze(0).cpu().detach()
167
+ return wav
168
+
169
+ def acoustictoken2wav(self, token):
170
+ # use music_tokenizer to generate waveform from token
171
+ token = token.view(token.size(0), -1, 4)
172
+ # codec = token.view(1, -1, 4)
173
+ codec_embed = self.music_tokenizer.quantizer.embed(token.long().to(self.device))
174
+ wav = self.music_tokenizer.generator(codec_embed)
175
+ wav = wav.squeeze(0).cpu().detach()
176
+ return wav
177
+
178
+ def semantictoken2wav(self, token):
179
+ # fast mode, use wavtokenizer decoder
180
+ new_tensor = token.to(self.device).unsqueeze(0)
181
+ features = self.wavtokenizer.codes_to_features(new_tensor)
182
+ bandwidth_id = torch.tensor([0]).to(self.device)
183
+ wav = self.wavtokenizer.to(self.device).decode(features, bandwidth_id=bandwidth_id)
184
+ wav = wav.cpu().detach()
185
+ return wav
186
+
187
+ @torch.inference_mode()
188
+ def inference(self, text, audio_token, audio_token_len, text_token, text_token_len, embeddings=None,
189
+ prompt_text=torch.zeros(1, 0, dtype=torch.int32),
190
+ llm_prompt_audio_token=torch.zeros(1, 0, dtype=torch.int32),
191
+ flow_prompt_audio_token=torch.zeros(1, 0, dtype=torch.int32),
192
+ prompt_audio_feat=torch.zeros(1, 0, 80), sample_rate=48000, duration_to_gen = 30, task="continuation", trim = True, stream=False, **kwargs):
193
+
194
+ # this_uuid is used to track variables related to this inference thread
195
+ # support tasks:
196
+ # text to music task
197
+ # music continuation task
198
+ # require either audio input only or text and audio inputs
199
+
200
+ this_uuid = str(uuid.uuid1())
201
+
202
+ if self.llm:
203
+ with self.lock:
204
+ self.music_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
205
+
206
+ p = threading.Thread(target=self.llm_job, args=(text_token, audio_token, audio_token_len, prompt_text, llm_prompt_audio_token, embeddings, this_uuid, duration_to_gen, task))
207
+ p.start()
208
+
209
+ if stream is True:
210
+ token_hop_len = self.token_min_hop_len
211
+ while True:
212
+ time.sleep(0.1)
213
+ if len(self.music_token_dict[this_uuid]) >= token_hop_len + self.token_overlap_len:
214
+ this_music_audio = self.token2wav(token=text_token,
215
+ token_len=text_token_len,
216
+ uuid=this_uuid,
217
+ sample_rate=sample_rate,
218
+ finalize=False)
219
+ yield {'music_audio': this_music_audio.cpu()}
220
+ with self.lock:
221
+ self.music_token_dict[this_uuid] = self.music_token_dict[this_uuid][token_hop_len:]
222
+ # increase token_hop_len for better audio quality
223
+ token_hop_len = min(self.token_max_hop_len, int(token_hop_len * self.stream_scale_factor))
224
+ if self.llm_end_dict[this_uuid] is True and len(self.music_token_dict[this_uuid]) < token_hop_len + self.token_overlap_len:
225
+ break
226
+ p.join()
227
+ # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
228
+ this_music_token = torch.concat(self.music_token_dict[this_uuid], dim=1)
229
+ # decode the remaining tokens with the flow decoder and vocoder
+ this_music_audio = self.token2wav(token=this_music_token,
+ token_len=torch.LongTensor([this_music_token.size(1)]),
+ uuid=this_uuid,
+ sample_rate=sample_rate,
+ finalize=True)
237
+ yield {'music_audio': this_music_audio.cpu()}
238
+ else:
239
+ # deal with all tokens
240
+ if self.fast:
241
+ if task == "reconstruct":
242
+ assert audio_token is not None
243
+ this_music_token = audio_token
244
+ this_music_audio = self.acoustictoken2wav(token=this_music_token)
245
+ else:
246
+ if self.llm:
247
+ p.join()
248
+ print(len(self.music_token_dict[this_uuid]))
249
+ this_music_token = torch.concat(self.music_token_dict[this_uuid], dim=1)
250
+ print(this_music_token.shape)
251
+ else:
252
+ this_music_token = text_token
253
+
254
+ logging.info("using wavtokenizer generator without flow matching")
255
+ this_music_audio = self.semantictoken2wav(token=this_music_token)
256
+ print(this_music_audio.shape)
257
+
258
+ else:
259
+ if self.llm:
260
+ p.join()
261
+ if len(self.music_token_dict[this_uuid]) != 0:
262
+ this_music_token = torch.concat(self.music_token_dict[this_uuid], dim=1)
263
+ else:
264
+ print(f"The list of tensors is empty for UUID: {this_uuid}")
265
+ else:
266
+ this_music_token = text_token
267
+ logging.info(f"LLM generated audio token length: {this_music_token.shape[1]}")
268
+ logging.info(f"using flow matching and {self.generator} generator")
269
+
270
+ if self.generator == "hifi":
271
+ if (embeddings[1] - embeddings[0]) <= duration_to_gen:
272
+ if trim:
273
+ trim_length = (int((embeddings[1] - embeddings[0])*75))
274
+ this_music_token = this_music_token[:, :trim_length]
275
+ logging.info(f"After trimmed, generated audio token length: {this_music_token.shape[1]}")
276
+ elif (embeddings[1] - embeddings[0]) < 1:
277
+ logging.info(f"Given audio length={(embeddings[1] - embeddings[0])}, which is too short, please give a longer audio length.")
278
+
279
+ this_music_audio = self.token2wav(token=this_music_token,
280
+ token_len=torch.LongTensor([this_music_token.size(1)]),
281
+ uuid=this_uuid,
282
+ sample_rate=sample_rate,
283
+ finalize=True)
284
+ logging.info(f"Generated audio sequence length: {this_music_audio.shape[1]}")
285
+ elif self.generator == "wavtokenizer":
286
+ if (embeddings[1] - embeddings[0]) < duration_to_gen:
287
+ if trim:
288
+ trim_length = (int((embeddings[1] - embeddings[0])*75))
289
+ this_music_token = this_music_token[:,:trim_length]
290
+ logging.info(f"After trimmed, generated audio token length: {this_music_token.shape[1]}")
291
+ elif (embeddings[1] - embeddings[0]) < 1:
292
+ logging.info(f"Given audio length={(embeddings[1] - embeddings[0])}, which is too short, please give a longer audio length.")
293
+
294
+ this_music_audio = self.semantictoken2wav(token=this_music_token)
295
+
296
+ yield {'music_audio': this_music_audio.cpu()}
297
+ torch.cuda.synchronize()
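The streaming branch above decodes only once at least token_hop_len + token_overlap_len tokens have accumulated, then drops the consumed hop and grows the hop length by stream_scale_factor. A minimal sketch of that buffering policy in isolation, with integers standing in for generated tokens (stream_scale_factor is set to 2 here purely for illustration; the class above uses 1):

# Hop/overlap streaming policy: decode when buffer >= hop + overlap,
# drop only the hop, keep the overlap for cross-fading, grow the hop.
token_min_hop_len, token_max_hop_len, token_overlap_len = 100, 200, 20
stream_scale_factor = 2            # illustrative only

buffer = list(range(1000))         # stand-in for generated tokens
hop = token_min_hop_len
while len(buffer) >= hop + token_overlap_len:
    chunk = buffer[:hop + token_overlap_len]
    print(f"decode {len(chunk)} tokens, keep overlap of {token_overlap_len}")
    buffer = buffer[hop:]
    hop = min(token_max_hop_len, int(hop * stream_scale_factor))
print(f"{len(buffer)} tokens left for the final, non-streaming decode")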
inspiremusic/dataset/__init__.py ADDED
File without changes
inspiremusic/dataset/dataset.py ADDED
@@ -0,0 +1,154 @@
1
+ # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
2
+ # 2024 Alibaba Inc
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import random
17
+ import json
18
+ import math
19
+ from functools import partial
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ from torch.utils.data import IterableDataset
24
+ from inspiremusic.utils.file_utils import read_lists, read_json_lists
25
+
26
+ class Processor(IterableDataset):
27
+
28
+ def __init__(self, source, f, *args, **kw):
29
+ assert callable(f)
30
+ self.source = source
31
+ self.f = f
32
+ self.args = args
33
+ self.kw = kw
34
+
35
+ def set_epoch(self, epoch):
36
+ self.source.set_epoch(epoch)
37
+
38
+ def __iter__(self):
39
+ """ Return an iterator over the source dataset processed by the
40
+ given processor.
41
+ """
42
+ assert self.source is not None
43
+ assert callable(self.f)
44
+ return self.f(iter(self.source), *self.args, **self.kw)
45
+
46
+ def apply(self, f):
47
+ assert callable(f)
48
+ return Processor(self, f, *self.args, **self.kw)
49
+
50
+
51
+ class DistributedSampler:
52
+
53
+ def __init__(self, shuffle=True, partition=True):
54
+ self.epoch = -1
55
+ self.update()
56
+ self.shuffle = shuffle
57
+ self.partition = partition
58
+
59
+ def update(self):
60
+ assert dist.is_available()
61
+ if dist.is_initialized():
62
+ self.rank = dist.get_rank()
63
+ self.world_size = dist.get_world_size()
64
+ else:
65
+ self.rank = 0
66
+ self.world_size = 1
67
+ worker_info = torch.utils.data.get_worker_info()
68
+ if worker_info is None:
69
+ self.worker_id = 0
70
+ self.num_workers = 1
71
+ else:
72
+ self.worker_id = worker_info.id
73
+ self.num_workers = worker_info.num_workers
74
+ return dict(rank=self.rank,
75
+ world_size=self.world_size,
76
+ worker_id=self.worker_id,
77
+ num_workers=self.num_workers)
78
+
79
+ def set_epoch(self, epoch):
80
+ self.epoch = epoch
81
+
82
+ def sample(self, data):
83
+ """ Sample data according to rank/world_size/num_workers
84
+
85
+ Args:
86
+ data(List): input data list
87
+
88
+ Returns:
89
+ List: data list after sample
90
+ """
91
+ data = list(range(len(data)))
92
+ # force datalist even
93
+
94
+ if self.partition:
95
+ if self.shuffle:
96
+ random.Random(self.epoch).shuffle(data)
97
+ if len(data) < self.world_size:
98
+ print(len(data), self.world_size)
99
+ data = data * math.ceil(self.world_size / len(data))
100
+ data = data[:self.world_size]
101
+ data = data[self.rank::self.world_size]
102
+ if len(data) < self.num_workers:
103
+ data = data * math.ceil(self.num_workers / len(data))
104
+ data = data[:self.num_workers]
105
+ data = data[self.worker_id::self.num_workers]
106
+ return data
107
+
108
+
109
+ class DataList(IterableDataset):
110
+
111
+ def __init__(self, lists, shuffle=True, partition=True):
112
+ self.lists = lists
113
+ self.sampler = DistributedSampler(shuffle, partition)
114
+
115
+ def set_epoch(self, epoch):
116
+ self.sampler.set_epoch(epoch)
117
+
118
+ def __iter__(self):
119
+ sampler_info = self.sampler.update()
120
+ indexes = self.sampler.sample(self.lists)
121
+ for index in indexes:
122
+ data = dict(src=self.lists[index])
123
+ data.update(sampler_info)
124
+ yield data
125
+
126
+
127
+ def Dataset(data_list_file,
128
+ data_pipeline,
129
+ mode='train',
130
+ shuffle=True,
131
+ partition=True
132
+ ):
133
+ """ Construct dataset from arguments
134
+
135
+ We have two shuffle stage in the Dataset. The first is global
136
+ shuffle at shards tar/raw file level. The second is global shuffle
137
+ at training samples level.
138
+
139
+ Args:
140
+ data_type(str): raw/shard
141
+ tokenizer (BaseTokenizer): tokenizer to tokenize
142
+ partition(bool): whether to do data partition in terms of rank
143
+ """
144
+ assert mode in ['train', 'inference', 'processing']
145
+ lists = read_lists(data_list_file)
146
+
147
+ dataset = DataList(lists,
148
+ shuffle=shuffle,
149
+ partition=partition)
150
+
151
+ for func in data_pipeline:
152
+ dataset = Processor(dataset, func, mode=mode)
153
+
154
+ return dataset
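Dataset() chains generator functions with Processor, so each pipeline stage receives the iterator produced by the previous stage. A self-contained sketch with toy stages (the source and stage functions below are illustrative only, assuming the Processor class added above is importable from inspiremusic.dataset.dataset):

from inspiremusic.dataset.dataset import Processor

# Toy source and stages to show how Processor chains generator functions.
class ToySource:
    def __init__(self, items): self.items = items
    def set_epoch(self, epoch): pass
    def __iter__(self): return iter(self.items)

def double(data, mode='train'):
    for x in data:
        yield x * 2

def keep_even(data, mode='train'):
    for x in data:
        if x % 2 == 0:
            yield x

dataset = ToySource([1, 2, 3, 4])
for stage in [double, keep_even]:
    dataset = Processor(dataset, stage, mode='train')
print(list(dataset))   # -> [2, 4, 6, 8]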
inspiremusic/dataset/processor.py ADDED
@@ -0,0 +1,595 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import random
16
+
17
+ import pyarrow.parquet as pq
18
+ import torch
19
+ import torchaudio
20
+ from torch.nn.utils.rnn import pad_sequence
21
+ import torch.nn.functional as F
22
+ import numpy as np
23
+ import re
24
+
25
+ torchaudio.set_audio_backend('soundfile')
26
+
27
+ AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
28
+ CHORUS = {"intro": 0, "chorus": 1, "verse1": 2, "verse2": 3, "verse": 2,
29
+ "outro": 4}
30
+
31
+ metadata_pattern = re.compile(r'^\[(ti|ar|al|by|offset):.*\]$')
32
+ timestamp_pattern = re.compile(r'^\[\d{2}:\d{2}\.\d{2}\](.*)$')
33
+
34
+
35
+ def parquet_opener(data, mode='train', audio_data={}):
36
+ """ Give url or local file, return file descriptor
37
+ Inplace operation.
38
+
39
+ Args:
40
+ data(Iterable[str]): url or local file list
41
+
42
+ Returns:
43
+ Iterable[{src, stream}]
44
+ """
45
+ for sample in data:
46
+ assert 'src' in sample
47
+
48
+ url = sample['src']
49
+ try:
50
+ df = pq.read_table(url).to_pandas()
51
+ for i in df.index:
52
+ sample.update(dict(df.loc[i]))
53
+ yield {**sample}
54
+ except Exception as ex:
55
+ logging.warning('Failed to open {}, ex info {}'.format(url, ex))
56
+
57
+
58
+ def clean_lyrics(data, mode="train"):
59
+ for sample in data:
60
+ lyrics = sample["text"]
61
+ cleaned = []
62
+ for line in lyrics.splitlines():
63
+ if metadata_pattern.match(line):
64
+ continue
65
+ timestamp_match = timestamp_pattern.match(line)
66
+ if timestamp_match:
67
+ lyric = timestamp_match.group(1).strip()
68
+ if lyric:
69
+ cleaned.append(lyric)
70
+ else:
71
+ if line.strip():
72
+ cleaned.append(line.strip())
73
+ sample["text"] = '\n'.join(cleaned)
74
+ yield sample
75
+
76
+
77
+ def cut_by_length(data, max_length=8000, num_times=4, mode="train"):
78
+ for sample in data:
79
+ if "semantic_token" in sample:
80
+ sample["semantic_token"] = [
81
+ sample["semantic_token"][0][:max_length]]
82
+ if "acoustic_token" not in sample:
83
+ sample["acoustic_token"] = sample["speech_token"]
84
+ sample["acoustic_token"] = sample["acoustic_token"][
85
+ :max_length * num_times]
86
+
87
+ yield sample
88
+
89
+
90
+ def filter(data,
91
+ max_length=22500, # 22500 #5min #10240
92
+ max_acoustic_length=45000,
93
+ min_length=10,
94
+ min_acoustic_length=150,
95
+ token_max_length=200,
96
+ token_min_length=1,
97
+ min_output_input_ratio=0.0005,
98
+ max_output_input_ratio=1,
99
+ mode='train'):
100
+ """ Filter sample according to feature and label length
101
+ Inplace operation.
102
+
103
+ Args::
104
+ data: Iterable[{key, wav, label, sample_rate}]
105
+ max_length: drop utterance which is greater than max_length(10ms)
106
+ min_length: drop utterance which is less than min_length(10ms)
107
+ token_max_length: drop utterance which is greater than
108
+ token_max_length, especially when use char unit for
109
+ english modeling
110
+ token_min_length: drop utterance which is
111
+ less than token_max_length
112
+ min_output_input_ratio: minimal ration of
113
+ token_length / feats_length(10ms)
114
+ max_output_input_ratio: maximum ration of
115
+ token_length / feats_length(10ms)
116
+
117
+ Returns:
118
+ Iterable[{key, wav, label, sample_rate}]
119
+ """
120
+ if mode == "train":
121
+ for sample in data:
122
+ if "semantic_token" in sample:
123
+ new_sample_frames = sample['semantic_token'][0].shape[0]
124
+ else:
125
+ new_sample_frames = sample['speech_token']
126
+
127
+ if "text_token" in sample:
128
+ new_sample_frames += len(sample['text_token'])
129
+
130
+ if new_sample_frames > max_length or new_sample_frames < min_length:
131
+ print(f"skipped 1 item length={new_sample_frames}")
132
+ continue
133
+
134
+ sample["chorus"] = sample["chorus"].split(",")
135
+ if not isinstance(sample["time_start"], np.ndarray):
136
+ sample["time_start"] = [sample["time_start"]]
137
+ sample["time_end"] = [sample["time_end"]]
138
+ for i, t in enumerate(sample["chorus"]):
139
+ if sample["chorus"][i] == "verse":
140
+ sample["chorus"][i] = "verse1"
141
+
142
+ yield sample
143
+
144
+ if mode == "train_flow":
145
+ for sample in data:
146
+ if "semantic_token" in sample:
147
+ new_sample_frames = sample['semantic_token'][0].shape[0]
148
+ if "acoustic_token" in sample:
149
+ target_sample_frames = sample['acoustic_token'][0].shape[0]
150
+
151
+ if new_sample_frames > max_length or new_sample_frames < min_acoustic_length or new_sample_frames < min_length or target_sample_frames > max_acoustic_length:
152
+ print(
153
+ f"skipped 1 item length={new_sample_frames}, target_length={target_sample_frames}")
154
+ continue
155
+
156
+ yield sample
157
+
158
+ elif mode == "inference":
159
+ for sample in data:
160
+ yield sample
161
+
162
+
163
+ def resample(data, resample_rate=22050, min_sample_rate=16000, mode='train'):
164
+ """ Resample data.
165
+ Inplace operation.
166
+
167
+ Args:
168
+ data: Iterable[{key, wav, label, sample_rate}]
169
+ resample_rate: target resample rate
170
+
171
+ Returns:
172
+ Iterable[{key, wav, label, sample_rate}]
173
+ """
174
+ for sample in data:
175
+ assert 'sample_rate' in sample
176
+ assert 'speech' in sample
177
+ sample_rate = sample['sample_rate']
178
+ waveform = sample['speech']
179
+ if sample_rate != resample_rate:
180
+ if sample_rate < min_sample_rate:
181
+ continue
182
+ sample['sample_rate'] = resample_rate
183
+ sample['speech'] = torchaudio.transforms.Resample(
184
+ orig_freq=sample_rate, new_freq=resample_rate)(waveform)
185
+ max_val = sample['speech'].abs().max()
186
+ if max_val > 1:
187
+ sample['speech'] /= max_val
188
+ yield sample
189
+
190
+
191
+ def truncate(data, truncate_length=24576, mode='train'):
192
+ """ Truncate data.
193
+
194
+ Args:
195
+ data: Iterable[{key, wav, label, sample_rate}]
196
+ truncate_length: truncate length
197
+
198
+ Returns:
199
+ Iterable[{key, wav, label, sample_rate}]
200
+ """
201
+ for sample in data:
202
+ waveform = sample['audio']
203
+ if waveform.shape[1] > truncate_length:
204
+ start = random.randint(0, waveform.shape[1] - truncate_length)
205
+ waveform = waveform[:, start: start + truncate_length]
206
+ else:
207
+ waveform = torch.concat([waveform, torch.zeros(1, truncate_length -
208
+ waveform.shape[1])],
209
+ dim=1)
210
+ sample['audio'] = waveform
211
+ yield sample
212
+
213
+
214
+ def upsample(data, resample_rate=48000, min_sample_rate=16000, mode='train',
215
+ n_codebook=4):
216
+ """ Resample data.
217
+ Inplace operation.
218
+
219
+ Args:
220
+ data: Iterable[{key, wav, label, sample_rate}]
221
+ resample_rate: target resample rate
222
+
223
+ Returns:
224
+ Iterable[{key, wav, label, sample_rate}]
225
+ """
226
+ for sample in data:
227
+ assert 'semantic_token' in sample
228
+ # TODO: unify data processing key names
229
+ if 'acoustic_token' not in sample:
230
+ continue
231
+
232
+ if 'sample_rate' in sample.keys():
233
+ sample_rate = sample['sample_rate']
234
+ else:
235
+ sample_rate = 24000
236
+ token = np.array(sample['semantic_token'][0][:-1])
237
+
238
+ # Calculate the repetition factor for resampling
239
+ repetition_factor = int(n_codebook * resample_rate / sample_rate)
240
+ if sample_rate != resample_rate:
241
+ if sample_rate < min_sample_rate:
242
+ continue
243
+ sample['sample_rate'] = resample_rate
244
+ sample['semantic_token'] = np.array(
245
+ [np.repeat(token, repetition_factor)])
246
+
247
+ yield sample
248
+
249
+ def compute_fbank(data,
250
+ feat_extractor,
251
+ mode='train'):
252
+ """ Extract fbank
253
+
254
+ Args:
255
+ data: Iterable[{key, wav, label, sample_rate}]
256
+
257
+ Returns:
258
+ Iterable[{key, feat, label}]
259
+ """
260
+ for sample in data:
261
+ assert 'sample_rate' in sample
262
+ assert 'speech' in sample
263
+ assert 'utt' in sample
264
+ assert 'text_token' in sample
265
+ waveform = sample['speech']
266
+ mat = feat_extractor(waveform).squeeze(dim=0).transpose(0, 1)
267
+ sample['speech_feat'] = mat
268
+ del sample['speech']
269
+ yield sample
270
+
271
+
272
+ def parse_embedding(data, normalize, mode='train'):
273
+ """ Parse utt_embedding/spk_embedding
274
+
275
+ Args:
276
+ data: Iterable[{key, wav, label, sample_rate}]
277
+
278
+ Returns:
279
+ Iterable[{key, feat, label}]
280
+ """
281
+
282
+ for sample in data:
283
+ sample['utt_embedding'] = torch.tensor(sample['utt_embedding'],
284
+ dtype=torch.float32)
285
+ sample['spk_embedding'] = torch.tensor(sample['spk_embedding'],
286
+ dtype=torch.float32)
287
+ if normalize:
288
+ sample['utt_embedding'] = F.normalize(sample['utt_embedding'],
289
+ dim=0)
290
+ sample['spk_embedding'] = F.normalize(sample['spk_embedding'],
291
+ dim=0)
292
+ yield sample
293
+
294
+ def tokenize(data, get_tokenizer, allowed_special, mode='train'):
295
+ """ Decode text to chars or BPE
296
+ Inplace operation
297
+
298
+ Args:
299
+ data: Iterable[{key, wav, txt, sample_rate}]
300
+
301
+ Returns:
302
+ Iterable[{key, wav, txt, tokens, label, sample_rate}]
303
+ """
304
+ tokenizer = get_tokenizer()
305
+
306
+ for sample in data:
307
+ assert 'text' in sample
308
+ sample['text_token'] = tokenizer.encode(sample['text'],
309
+ allowed_special=allowed_special)
310
+ yield sample
311
+
312
+
313
+ def shuffle(data, shuffle_size=10000, mode='train'):
314
+ """ Local shuffle the data
315
+
316
+ Args:
317
+ data: Iterable[{key, feat, label}]
318
+ shuffle_size: buffer size for shuffle
319
+
320
+ Returns:
321
+ Iterable[{key, feat, label}]
322
+ """
323
+ buf = []
324
+ for sample in data:
325
+ buf.append(sample)
326
+ if len(buf) >= shuffle_size:
327
+ random.shuffle(buf)
328
+ for x in buf:
329
+ yield x
330
+ buf = []
331
+ # The sample left over
332
+ random.shuffle(buf)
333
+ for x in buf:
334
+ yield x
335
+
336
+
337
+ def sort(data, sort_size=500, mode='train'):
338
+ """ Sort the data by feature length.
339
+ Sort is used after shuffle and before batch, so we can group
340
+ utts with similar lengths into a batch, and `sort_size` should
341
+ be less than `shuffle_size`
342
+
343
+ Args:
344
+ data: Iterable[{key, feat, label}]
345
+ sort_size: buffer size for sort
346
+
347
+ Returns:
348
+ Iterable[{key, feat, label}]
349
+ """
350
+
351
+ buf = []
352
+ for sample in data:
353
+ if sample["chorus"] == "verse":
354
+ sample["chorus"] = "verse1"
355
+
356
+ if sample["acoustic_token"].shape[0] == 1:
357
+ sample["acoustic_token"] = np.concatenate(
358
+ sample["acoustic_token"][0])
359
+ else:
360
+ sample["acoustic_token"] = np.concatenate(sample["acoustic_token"])
361
+
362
+ sample["acoustic_token"] = torch.from_numpy(sample["acoustic_token"])
363
+ buf.append(sample)
364
+ if len(buf) >= sort_size:
365
+ buf.sort(key=lambda x: x['acoustic_token'].size(0))
366
+ for x in buf:
367
+ yield x
368
+ buf = []
369
+ # The sample left over
370
+ buf.sort(key=lambda x: x['acoustic_token'].size(0))
371
+ for x in buf:
372
+ yield x
373
+
374
+
375
+ def static_batch(data, batch_size=32):
376
+ """ Static batch the data by `batch_size`
377
+
378
+ Args:
379
+ data: Iterable[{key, feat, label}]
380
+ batch_size: batch size
381
+
382
+ Returns:
383
+ Iterable[List[{key, feat, label}]]
384
+ """
385
+ buf = []
386
+ data_empty = True
387
+ for sample in data:
388
+ data_empty = False
389
+ buf.append(sample)
390
+ if len(buf) >= batch_size:
391
+ yield buf
392
+ buf = []
393
+ if data_empty:
394
+ raise ValueError("data is empty")
395
+ if len(buf) > 0:
396
+ yield buf
397
+
398
+
399
+ def dynamic_batch(data, max_frames_in_batch=12000, mode='train'):
400
+ """ Dynamic batch the data until the total frames in batch
401
+ reach `max_frames_in_batch`
402
+
403
+ Args:
404
+ data: Iterable[{key, feat, label}]
405
+ max_frames_in_batch: max_frames in one batch
406
+
407
+ Returns:
408
+ Iterable[List[{key, feat, label}]]
409
+ """
410
+ buf = []
411
+ longest_frames = 0
412
+ for sample in data:
413
+ assert 'acoustic_token' in sample
414
+ assert isinstance(sample['acoustic_token'], torch.Tensor)
415
+
416
+ if 'semantic_token' in sample:
417
+ new_sample_frames = sample['semantic_token'][0].shape[0]
418
+ else:
419
+ # fall back to the acoustic token length when no semantic tokens are present
+ new_sample_frames = sample['acoustic_token'].size(0)
420
+
421
+ if "text_token" in sample:
422
+ new_sample_frames += len(sample['text_token'])
423
+
424
+ longest_frames = max(longest_frames, new_sample_frames)
425
+ frames_after_padding = longest_frames * (len(buf) + 1)
426
+
427
+ if frames_after_padding > max_frames_in_batch:
428
+ if len(buf) > 0:
429
+ yield buf
430
+ buf = [sample]
431
+ longest_frames = new_sample_frames
432
+ else:
433
+ buf.append(sample)
434
+ if len(buf) > 0:
435
+ yield buf
436
+
437
+
438
+ def batch(data, batch_type='static', batch_size=16, max_frames_in_batch=12000,
439
+ mode='train'):
440
+ """ Wrapper for static/dynamic batch
441
+ """
442
+ if mode == 'inference':
443
+ return static_batch(data, 1)
444
+ elif mode == 'processing':
445
+ return static_batch(data, batch_size)
446
+ else:
447
+ if batch_type == 'static':
448
+ return static_batch(data, batch_size)
449
+ elif batch_type == 'dynamic':
450
+ return dynamic_batch(data, max_frames_in_batch)
451
+ else:
452
+ logging.fatal('Unsupported batch type {}'.format(batch_type))
453
+
454
+
455
+ def padding(data, mode='train'):
456
+ """ Padding the data into training data
457
+
458
+ Args:
459
+ data: Iterable[List[{key, feat, label}]]
460
+
461
+ Returns:
462
+ Iterable[Tuple(keys, feats, labels, feats lengths, label lengths)]
463
+ """
464
+ if mode == "train":
465
+ for sample in data:
466
+ assert isinstance(sample, list)
467
+ if len(sample) != 0:
468
+ acoustic_feat_len = torch.tensor(
469
+ [x['acoustic_token'].size(0) for x in sample],
470
+ dtype=torch.int32)
471
+ order = torch.argsort(acoustic_feat_len, descending=True)
472
+ utts = [sample[i]['utt'] for i in order]
473
+ acoustic_token = [
474
+ sample[i]['acoustic_token'].clone().to(torch.int32) for i in
475
+ order]
476
+ acoustic_token_len = torch.tensor(
477
+ [i.size(0) for i in acoustic_token], dtype=torch.int32)
478
+
479
+ acoustic_token = pad_sequence(acoustic_token,
480
+ batch_first=True,
481
+ padding_value=0)
482
+
483
+ text = [sample[i]['text'] for i in order]
484
+ text_token = [torch.tensor(sample[i]['text_token']).long() for i
485
+ in order]
486
+ text_token_len = torch.tensor([i.size(0) for i in text_token],
487
+ dtype=torch.int32)
488
+ text_token = pad_sequence(text_token, batch_first=True,
489
+ padding_value=0)
490
+ time_start = torch.tensor(
491
+ [sample[i]['time_start'] for i in order])
492
+ time_end = torch.tensor([sample[i]['time_end'] for i in order])
493
+
494
+ if isinstance(sample[0]['chorus'], str):
495
+ chorus = torch.tensor(
496
+ [CHORUS[sample[i]['chorus']] for i in order])
497
+ else:
498
+ chorus = [
499
+ torch.tensor([CHORUS[t] for t in sample[i]['chorus']])
500
+ for i in order]
501
+ chorus = pad_sequence(chorus, batch_first=True,
502
+ padding_value=-1)
503
+
504
+ batch = {
505
+ "utts" : utts,
506
+ "acoustic_token" : acoustic_token,
507
+ "acoustic_token_len": acoustic_token_len,
508
+ "time_start" : time_start,
509
+ "time_end" : time_end,
510
+ "chorus" : chorus,
511
+ "text" : text,
512
+ "text_token" : text_token,
513
+ "text_token_len" : text_token_len,
514
+ }
515
+
516
+ if "semantic_token" in sample[0]:
517
+ semantic_token = [
518
+ torch.tensor(sample[i]['semantic_token'][0],
519
+ dtype=torch.int32) for i in order]
520
+ semantic_token_len = torch.tensor(
521
+ [i.size(0) for i in semantic_token],
522
+ dtype=torch.int32)
523
+ semantic_token = pad_sequence(semantic_token,
524
+ batch_first=True,
525
+ padding_value=0)
526
+ batch.update({"semantic_token" : semantic_token,
527
+ "semantic_token_len": semantic_token_len})
528
+
529
+ yield batch
530
+ else:
531
+ logging.info("WARNING: sample is empty []!")
532
+
533
+ elif mode == "inference":
534
+ for sample in data:
535
+ assert isinstance(sample, list)
536
+ utts = [sample[i]['utt'] for i in range(len(sample))]
537
+ text = [sample[i]['text'] for i in range(len(sample))]
538
+ text_token = [torch.tensor(sample[i]['text_token']).long() for i in
539
+ range(len(sample))]
540
+ text_token_len = torch.tensor([i.size(0) for i in text_token],
541
+ dtype=torch.int32)
542
+ text_token = pad_sequence(text_token, batch_first=True,
543
+ padding_value=0)
544
+ time_start = torch.tensor(
545
+ [sample[i]['time_start'] for i in range(len(sample))])
546
+ time_end = torch.tensor(
547
+ [sample[i]['time_end'] for i in range(len(sample))])
548
+
549
+ if isinstance(sample[0]['chorus'], str):
550
+ chorus = torch.tensor([CHORUS[sample[i]['chorus']] for i in
551
+ range(len(sample))])
552
+ else:
553
+ chorus = [torch.tensor([CHORUS[t] for t in sample[i]['chorus']])
554
+ for i in range(len(sample))]
555
+ chorus = pad_sequence(chorus, batch_first=True,
556
+ padding_value=-1)
557
+
558
+ if "acoustic_token" in sample[0]:
559
+ acoustic_token = [
560
+ sample[i]['acoustic_token'].clone().to(torch.int32) for i in
561
+ range(len(sample))]
562
+ acoustic_token_len = torch.tensor(
563
+ [i.size(0) for i in acoustic_token], dtype=torch.int32)
564
+ acoustic_token = pad_sequence(acoustic_token,
565
+ batch_first=True,
566
+ padding_value=0)
567
+ else:
568
+ acoustic_token = None
569
+ acoustic_token_len = None
570
+
571
+ batch = {
572
+ "utts" : utts,
573
+ "acoustic_token" : acoustic_token,
574
+ "acoustic_token_len": acoustic_token_len,
575
+ "time_start" : time_start,
576
+ "time_end" : time_end,
577
+ "chorus" : chorus,
578
+ "text" : text,
579
+ "text_token" : text_token,
580
+ "text_token_len" : text_token_len,
581
+ }
582
+
583
+ if "semantic_token" in sample[0]:
584
+ semantic_token = [torch.tensor(sample[i]['semantic_token'][0],
585
+ dtype=torch.int32) for i in
586
+ range(len(sample))]
587
+ semantic_token_len = torch.tensor(
588
+ [i.size(0) for i in semantic_token], dtype=torch.int32)
589
+ semantic_token = pad_sequence(semantic_token,
590
+ batch_first=True,
591
+ padding_value=0)
592
+ batch.update({"semantic_token" : semantic_token,
593
+ "semantic_token_len": semantic_token_len})
594
+
595
+ yield batch
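A minimal pipeline sketch for the processors above (illustrative only; `raw_samples` and `tokenizer_factory` are hypothetical stand-ins, and the upsample/parse_embedding stages are omitted):

    samples = tokenize(raw_samples, get_tokenizer=tokenizer_factory, allowed_special='all')
    samples = shuffle(samples, shuffle_size=10000)
    samples = sort(samples, sort_size=500)          # also converts acoustic_token to torch tensors
    batches = batch(samples, batch_type='dynamic', max_frames_in_batch=12000)
    for minibatch in padding(batches, mode='train'):
        pass  # minibatch: dict of padded tensors (acoustic_token, text_token, chorus, ...)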
inspiremusic/flow/decoder.py ADDED
@@ -0,0 +1,277 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ from einops import pack, rearrange, repeat
17
+ from matcha.models.components.decoder import SinusoidalPosEmb, Block1D, ResnetBlock1D, Downsample1D, TimestepEmbedding, Upsample1D
18
+ from matcha.models.components.transformer import BasicTransformerBlock
19
+
20
+ class Transpose(torch.nn.Module):
21
+ def __init__(self, dim0: int, dim1: int):
22
+ super().__init__()
23
+ self.dim0 = dim0
24
+ self.dim1 = dim1
25
+
26
+ def forward(self, x: torch.Tensor):
27
+ x = torch.transpose(x, self.dim0, self.dim1)
28
+ return x
29
+
30
+ class CausalBlock1D(Block1D):
31
+ def __init__(self, dim: int, dim_out: int):
32
+ super(CausalBlock1D, self).__init__(dim, dim_out)
33
+ self.block = torch.nn.Sequential(
34
+ CausalConv1d(dim, dim_out, 3),
35
+ Transpose(1, 2),
36
+ nn.LayerNorm(dim_out),
37
+ Transpose(1, 2),
38
+ nn.Mish(),
39
+ )
40
+
41
+ def forward(self, x: torch.Tensor, mask: torch.Tensor):
42
+ output = self.block(x * mask)
43
+ return output * mask
44
+
45
+
46
+ class CausalResnetBlock1D(ResnetBlock1D):
47
+ def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8):
48
+ super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups)
49
+ self.block1 = CausalBlock1D(dim, dim_out)
50
+ self.block2 = CausalBlock1D(dim_out, dim_out)
51
+
52
+ class CausalConv1d(torch.nn.Conv1d):
53
+ def __init__(
54
+ self,
55
+ in_channels: int,
56
+ out_channels: int,
57
+ kernel_size: int,
58
+ stride: int = 1,
59
+ dilation: int = 1,
60
+ groups: int = 1,
61
+ bias: bool = True,
62
+ padding_mode: str = 'zeros',
63
+ device=None,
64
+ dtype=None
65
+ ) -> None:
66
+ super(CausalConv1d, self).__init__(in_channels, out_channels,
67
+ kernel_size, stride,
68
+ padding=0, dilation=dilation,
69
+ groups=groups, bias=bias,
70
+ padding_mode=padding_mode,
71
+ device=device, dtype=dtype)
72
+ assert stride == 1
73
+ self.causal_padding = (kernel_size - 1, 0)
74
+
75
+ def forward(self, x: torch.Tensor):
76
+ x = F.pad(x, self.causal_padding)
77
+ x = super(CausalConv1d, self).forward(x)
78
+ return x
79
+
80
+ class ConditionalDecoder(nn.Module):
81
+ def __init__(
82
+ self,
83
+ in_channels,
84
+ out_channels,
85
+ channels=(256, 256),
86
+ dropout=0.05,
87
+ attention_head_dim=64,
88
+ n_blocks=1,
89
+ num_mid_blocks=2,
90
+ num_heads=4,
91
+ act_fn="snake",
92
+ ):
93
+ """
94
+ This decoder requires an input with the same shape as the target. So, if your text content
95
+ is shorter or longer than the outputs, please resample it before feeding it to the decoder.
96
+ """
97
+ super().__init__()
98
+ channels = tuple(channels)
99
+ self.in_channels = in_channels
100
+ self.out_channels = out_channels
101
+
102
+ self.time_embeddings = SinusoidalPosEmb(in_channels)
103
+ time_embed_dim = channels[0] * 4
104
+ self.time_mlp = TimestepEmbedding(
105
+ in_channels=in_channels,
106
+ time_embed_dim=time_embed_dim,
107
+ act_fn="silu",
108
+ )
109
+ self.down_blocks = nn.ModuleList([])
110
+ self.mid_blocks = nn.ModuleList([])
111
+ self.up_blocks = nn.ModuleList([])
112
+
113
+ output_channel = in_channels
114
+ for i in range(len(channels)): # pylint: disable=consider-using-enumerate
115
+ input_channel = output_channel
116
+ output_channel = channels[i]
117
+ is_last = i == len(channels) - 1
118
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
119
+ transformer_blocks = nn.ModuleList(
120
+ [
121
+ BasicTransformerBlock(
122
+ dim=output_channel,
123
+ num_attention_heads=num_heads,
124
+ attention_head_dim=attention_head_dim,
125
+ dropout=dropout,
126
+ activation_fn=act_fn,
127
+ )
128
+ for _ in range(n_blocks)
129
+ ]
130
+ )
131
+ downsample = (
132
+ Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
133
+ )
134
+ self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))
135
+
136
+ for _ in range(num_mid_blocks):
137
+ input_channel = channels[-1]
138
+ out_channels = channels[-1]
139
+ resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
140
+
141
+ transformer_blocks = nn.ModuleList(
142
+ [
143
+ BasicTransformerBlock(
144
+ dim=output_channel,
145
+ num_attention_heads=num_heads,
146
+ attention_head_dim=attention_head_dim,
147
+ dropout=dropout,
148
+ activation_fn=act_fn,
149
+ )
150
+ for _ in range(n_blocks)
151
+ ]
152
+ )
153
+
154
+ self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))
155
+
156
+ channels = channels[::-1] + (channels[0],)
157
+ for i in range(len(channels) - 1):
158
+ input_channel = channels[i] * 2
159
+ output_channel = channels[i + 1]
160
+ is_last = i == len(channels) - 2
161
+ resnet = ResnetBlock1D(
162
+ dim=input_channel,
163
+ dim_out=output_channel,
164
+ time_emb_dim=time_embed_dim,
165
+ )
166
+ transformer_blocks = nn.ModuleList(
167
+ [
168
+ BasicTransformerBlock(
169
+ dim=output_channel,
170
+ num_attention_heads=num_heads,
171
+ attention_head_dim=attention_head_dim,
172
+ dropout=dropout,
173
+ activation_fn=act_fn,
174
+ )
175
+ for _ in range(n_blocks)
176
+ ]
177
+ )
178
+ upsample = (
179
+ Upsample1D(output_channel, use_conv_transpose=True)
180
+ if not is_last
181
+ else nn.Conv1d(output_channel, output_channel, 3, padding=1)
182
+ )
183
+ self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))
184
+ self.final_block = Block1D(channels[-1], channels[-1])
185
+ self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)
186
+ self.initialize_weights()
187
+
188
+ def initialize_weights(self):
189
+ for m in self.modules():
190
+ if isinstance(m, nn.Conv1d):
191
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
192
+ if m.bias is not None:
193
+ nn.init.constant_(m.bias, 0)
194
+ elif isinstance(m, nn.GroupNorm):
195
+ nn.init.constant_(m.weight, 1)
196
+ nn.init.constant_(m.bias, 0)
197
+ elif isinstance(m, nn.Linear):
198
+ nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
199
+ if m.bias is not None:
200
+ nn.init.constant_(m.bias, 0)
201
+
202
+ def forward(self, x, mask, mu, t, spks=None, cond=None):
203
+ """Forward pass of the UNet1DConditional model.
204
+
205
+ Args:
206
+ x (torch.Tensor): shape (batch_size, in_channels, time)
207
+ mask (_type_): shape (batch_size, 1, time)
208
+ t (_type_): shape (batch_size)
209
+ spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
210
+ cond (_type_, optional): placeholder for future use. Defaults to None.
211
+
212
+ Raises:
213
+ ValueError: _description_
214
+ ValueError: _description_
215
+
216
+ Returns:
217
+ _type_: _description_
218
+ """
219
+
220
+ t = self.time_embeddings(t).to(t.dtype)
221
+ t = self.time_mlp(t)
222
+ x = pack([x, mu], "b * t")[0]
223
+ if spks is not None:
224
+ spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
225
+ x = pack([x, spks], "b * t")[0]
226
+ if cond is not None:
227
+ x = pack([x, cond], "b * t")[0]
228
+ hiddens = []
229
+ masks = [mask]
230
+ for resnet, transformer_blocks, downsample in self.down_blocks:
231
+ mask_down = masks[-1]
232
+ x = resnet(x, mask_down, t)
233
+ x = rearrange(x, "b c t -> b t c").contiguous()
234
+ attn_mask = torch.matmul(mask_down.transpose(1, 2).contiguous(), mask_down)
235
+ for transformer_block in transformer_blocks:
236
+ x = transformer_block(
237
+ hidden_states=x,
238
+ attention_mask=attn_mask,
239
+ timestep=t,
240
+ )
241
+ x = rearrange(x, "b t c -> b c t").contiguous()
242
+ hiddens.append(x) # Save hidden states for skip connections
243
+ x = downsample(x * mask_down)
244
+ masks.append(mask_down[:, :, ::2])
245
+ masks = masks[:-1]
246
+ mask_mid = masks[-1]
247
+
248
+ for resnet, transformer_blocks in self.mid_blocks:
249
+ x = resnet(x, mask_mid, t)
250
+ x = rearrange(x, "b c t -> b t c").contiguous()
251
+ attn_mask = torch.matmul(mask_mid.transpose(1, 2).contiguous(), mask_mid)
252
+ for transformer_block in transformer_blocks:
253
+ x = transformer_block(
254
+ hidden_states=x,
255
+ attention_mask=attn_mask,
256
+ timestep=t,
257
+ )
258
+ x = rearrange(x, "b t c -> b c t").contiguous()
259
+
260
+ for resnet, transformer_blocks, upsample in self.up_blocks:
261
+ mask_up = masks.pop()
262
+ skip = hiddens.pop()
263
+ x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0]
264
+ x = resnet(x, mask_up, t)
265
+ x = rearrange(x, "b c t -> b t c").contiguous()
266
+ attn_mask = torch.matmul(mask_up.transpose(1, 2).contiguous(), mask_up)
267
+ for transformer_block in transformer_blocks:
268
+ x = transformer_block(
269
+ hidden_states=x,
270
+ attention_mask=attn_mask,
271
+ timestep=t,
272
+ )
273
+ x = rearrange(x, "b t c -> b c t").contiguous()
274
+ x = upsample(x * mask_up)
275
+ x = self.final_block(x, mask_up)
276
+ output = self.final_proj(x * mask_up)
277
+ return output * mask
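A small sanity sketch for the causal convolution above (illustrative only): CausalConv1d left-pads by kernel_size - 1, so the output keeps the input length and frame t never sees frames after t.

    import torch
    conv = CausalConv1d(in_channels=4, out_channels=8, kernel_size=3)
    x = torch.randn(1, 4, 10)       # (batch, channels, time)
    y = conv(x)
    assert y.shape == (1, 8, 10)    # same time length, strictly causal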
inspiremusic/flow/flow.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import logging
15
+ import random
16
+ from typing import Dict, Optional
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import functional as F
20
+ from omegaconf import DictConfig
21
+ from inspiremusic.utils.mask import make_pad_mask
22
+ from inspiremusic.music_tokenizer.vqvae import VQVAE
23
+
24
+ class MaskedDiff(torch.nn.Module):
25
+ def __init__(self,
26
+ input_size: int = 512,
27
+ output_size: int = 128,
28
+ output_type: str = "mel",
29
+ vocab_size: int = 4096,
30
+ input_frame_rate: int = 50,
31
+ only_mask_loss: bool = True,
32
+ encoder: torch.nn.Module = None,
33
+ length_regulator: torch.nn.Module = None,
34
+ decoder: torch.nn.Module = None,
35
+ decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80,
36
+ 'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
37
+ 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
38
+ 'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
39
+ 'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
40
+ mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 128, 'sampling_rate': 48000,
41
+ 'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 48000},
42
+ generator_model_dir: str = "../../pretrained_models/InspireMusic-Base/music_tokenizer",
43
+ num_codebooks: int = 4
44
+ ):
45
+ super().__init__()
46
+ self.input_size = input_size
47
+ self.output_size = output_size
48
+ self.decoder_conf = decoder_conf
49
+ self.mel_feat_conf = mel_feat_conf
50
+ self.vocab_size = vocab_size
51
+ self.output_type = output_type
52
+ self.input_frame_rate = input_frame_rate
53
+ logging.info(f"input frame rate={self.input_frame_rate}")
54
+ self.input_embedding = nn.Embedding(vocab_size, input_size)
55
+
56
+ self.encoder = encoder
57
+ self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
58
+ self.decoder = decoder
59
+ self.length_regulator = length_regulator
60
+ self.only_mask_loss = only_mask_loss
61
+ self.quantizer = VQVAE(f'{generator_model_dir}/config.json',
62
+ f'{generator_model_dir}/model.pt', with_encoder=True).quantizer
63
+ self.quantizer.eval()
64
+ self.num_codebooks = num_codebooks
65
+ self.cond = None
66
+ self.interpolate = False
67
+
68
+ def forward(
69
+ self,
70
+ batch: dict,
71
+ device: torch.device,
72
+ ) -> Dict[str, Optional[torch.Tensor]]:
73
+
74
+ audio_token = batch['acoustic_token'].to(device)
75
+ audio_token_len = batch['acoustic_token_len'].to(device)
76
+ audio_token = audio_token.view(audio_token.size(0),-1,self.num_codebooks)
77
+ if "semantic_token" not in batch:
78
+ token = audio_token[:,:,0]
79
+ token_len = (audio_token_len/self.num_codebooks).long()
80
+
81
+ else:
82
+ token = batch['semantic_token'].to(device)
83
+ token_len = batch['semantic_token_len'].to(device)
84
+
85
+ with torch.no_grad():
86
+ feat = self.quantizer.embed(audio_token)
87
+ feat_len = (audio_token_len/self.num_codebooks).long()
88
+
89
+ token = self.input_embedding(token)
90
+ h, h_lengths = self.encoder(token, token_len)
91
+ h, h_lengths = self.length_regulator(h, feat_len)
92
+
93
+ # get conditions
94
+ if self.cond:
95
+ conds = torch.zeros(feat.shape, device=token.device)
96
+ for i, j in enumerate(feat_len):
97
+ if random.random() < 0.5:
98
+ continue
99
+ index = random.randint(0, int(0.3 * j))
100
+ conds[i, :index] = feat[i, :index]
101
+ conds = conds.transpose(1, 2)
102
+ else:
103
+ conds = None
104
+
105
+ mask = (~make_pad_mask(feat_len)).to(h)
106
+
107
+ loss, _ = self.decoder.compute_loss(
108
+ feat,
109
+ mask.unsqueeze(1),
110
+ h.transpose(1, 2).contiguous(),
111
+ None,
112
+ cond=conds
113
+ )
114
+
115
+ return {'loss': loss}
116
+
117
+ @torch.inference_mode()
118
+ def inference(self,
119
+ token,
120
+ token_len,
121
+ sample_rate):
122
+ assert token.shape[0] == 1
123
+
124
+ token = self.input_embedding(torch.clamp(token, min=0))
125
+ h, h_lengths = self.encoder(token, token_len)
126
+
127
+ if sample_rate == 48000:
128
+ token_len = 2 * token_len
129
+
130
+ h, h_lengths = self.length_regulator(h, token_len)
131
+
132
+ # get conditions
133
+ conds = None
134
+
135
+ mask = (~make_pad_mask(token_len)).to(h)
136
+ feat = self.decoder(
137
+ mu=h.transpose(1, 2).contiguous(),
138
+ mask=mask.unsqueeze(1),
139
+ spks=None,
140
+ cond=conds,
141
+ n_timesteps=10
142
+ )
143
+ return feat
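A usage sketch for the flow model above (illustrative only; `flow` is an already constructed MaskedDiff instance and the token values are placeholders):

    import torch
    tokens = torch.randint(0, 4096, (1, 250))            # (batch, num_semantic_tokens)
    token_len = torch.tensor([250], dtype=torch.int32)
    feat = flow.inference(tokens, token_len, sample_rate=48000)
    # feat: (1, output_size, T) acoustic features; token_len is doubled internally at 48 kHz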
inspiremusic/flow/flow_matching.py ADDED
@@ -0,0 +1,167 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
+ import onnxruntime
15
+ import torch.nn.functional as F
16
+ from matcha.models.components.flow_matching import BASECFM
17
+
18
+
19
+ class ConditionalCFM(BASECFM):
20
+ def __init__(self, in_channels, cfm_params, estimator: torch.nn.Module = None):
21
+ super().__init__(
22
+ n_feats=in_channels,
23
+ cfm_params=cfm_params,
24
+ )
25
+ self.t_scheduler = cfm_params.t_scheduler
26
+ self.training_cfg_rate = cfm_params.training_cfg_rate
27
+ self.inference_cfg_rate = cfm_params.inference_cfg_rate
28
+ # Just change the architecture of the estimator here
29
+ self.estimator = estimator
30
+
31
+ @torch.inference_mode()
32
+ def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
33
+ """Forward diffusion
34
+
35
+ Args:
36
+ mu (torch.Tensor): output of encoder
37
+ shape: (batch_size, n_feats, mel_timesteps)
38
+ mask (torch.Tensor): output_mask
39
+ shape: (batch_size, 1, mel_timesteps)
40
+ n_timesteps (int): number of diffusion steps
41
+ temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
42
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
43
+ shape: (batch_size, spk_emb_dim)
44
+ cond: Not used but kept for future purposes
45
+
46
+ Returns:
47
+ sample: generated mel-spectrogram
48
+ shape: (batch_size, n_feats, mel_timesteps)
49
+ """
50
+ z = torch.randn_like(mu) * temperature
51
+ t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
52
+ if self.t_scheduler == 'cosine':
53
+ t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
54
+ return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
55
+
56
+ def solve_euler(self, x, t_span, mu, mask, spks, cond):
57
+ """
58
+ Fixed euler solver for ODEs.
59
+ Args:
60
+ x (torch.Tensor): random noise
61
+ t_span (torch.Tensor): n_timesteps interpolated
62
+ shape: (n_timesteps + 1,)
63
+ mu (torch.Tensor): output of encoder
64
+ shape: (batch_size, n_feats, mel_timesteps)
65
+ mask (torch.Tensor): output_mask
66
+ shape: (batch_size, 1, mel_timesteps)
67
+ spks (torch.Tensor, optional): speaker ids. Defaults to None.
68
+ shape: (batch_size, spk_emb_dim)
69
+ cond: Not used but kept for future purposes
70
+ """
71
+ t, _, dt = t_span[0], t_span[-1], t_span[1] - t_span[0]
72
+ t = t.unsqueeze(dim=0)
73
+
74
+ # I am storing this because I can later plot it by putting a debugger here and saving it to a file
75
+ # Or in future might add like a return_all_steps flag
76
+ sol = []
77
+
78
+ for step in range(1, len(t_span)):
79
+ dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond)
80
+ # Classifier-Free Guidance inference introduced in VoiceBox
81
+ if self.inference_cfg_rate > 0:
82
+ cfg_dphi_dt = self.forward_estimator(
83
+ x, mask,
84
+ torch.zeros_like(mu), t,
85
+ torch.zeros_like(spks) if spks is not None else None,
86
+ torch.zeros_like(cond) if cond is not None else None
87
+ )
88
+ dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
89
+ self.inference_cfg_rate * cfg_dphi_dt)
90
+ x = x + dt * dphi_dt
91
+ t = t + dt
92
+ sol.append(x)
93
+ if step < len(t_span) - 1:
94
+ dt = t_span[step + 1] - t
95
+
96
+ return sol[-1]
97
+
98
+ def forward_estimator(self, x, mask, mu, t, spks, cond):
99
+ if isinstance(self.estimator, torch.nn.Module):
100
+ return self.estimator.forward(x, mask, mu, t, spks, cond)
101
+ elif isinstance(self.estimator, onnxruntime.InferenceSession):
102
+ ort_inputs = {
103
+ 'x': x.cpu().numpy(),
104
+ 'mask': mask.cpu().numpy(),
105
+ 'mu': mu.cpu().numpy(),
106
+ 't': t.cpu().numpy(),
107
+ 'spks': spks.cpu().numpy(),
108
+ 'cond': cond.cpu().numpy()
109
+ }
110
+ output = self.estimator.run(None, ort_inputs)[0]
111
+ return torch.tensor(output, dtype=x.dtype, device=x.device)
112
+ else:
113
+ self.estimator.set_input_shape('x', (2, 80, x.size(2)))
114
+ self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
115
+ self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
116
+ self.estimator.set_input_shape('t', (2,))
117
+ self.estimator.set_input_shape('spks', (2, 80))
118
+ self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
119
+ # run trt engine
120
+ self.estimator.execute_v2([x.contiguous().data_ptr(),
121
+ mask.contiguous().data_ptr(),
122
+ mu.contiguous().data_ptr(),
123
+ t.contiguous().data_ptr(),
124
+ spks.contiguous().data_ptr(),
125
+ cond.contiguous().data_ptr(),
126
+ x.data_ptr()])
127
+ return x
128
+
129
+ def compute_loss(self, x1, mask, mu, spks=None, cond=None):
130
+ """Computes diffusion loss
131
+
132
+ Args:
133
+ x1 (torch.Tensor): Target
134
+ shape: (batch_size, n_feats, mo)
135
+ mask (torch.Tensor): target mask
136
+ shape: (batch_size, 1, mel_timesteps)
137
+ mu (torch.Tensor): output of encoder
138
+ shape: (batch_size, n_feats, mel_timesteps)
139
+ spks (torch.Tensor, optional): speaker embedding. Defaults to None.
140
+ shape: (batch_size, spk_emb_dim)
141
+
142
+ Returns:
143
+ loss: conditional flow matching loss
144
+ y: conditional flow
145
+ shape: (batch_size, n_feats, mel_timesteps)
146
+ """
147
+ b, _, t = mu.shape
148
+
149
+ t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
150
+ if self.t_scheduler == 'cosine':
151
+ t = 1 - torch.cos(t * 0.5 * torch.pi)
152
+
153
+ z = torch.randn_like(x1)
154
+ y = (1 - (1 - self.sigma_min) * t) * z + t * x1
155
+ u = x1 - (1 - self.sigma_min) * z
156
+
157
+ # during training, we randomly drop condition to trade off mode coverage and sample fidelity
158
+ if self.training_cfg_rate > 0:
159
+ cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
160
+ mu = mu * cfg_mask.view(-1, 1, 1)
161
+ if cond is not None:
162
+ cond = cond * cfg_mask.view(-1, 1, 1)
163
+
164
+ pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
165
+ loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
166
+ return loss, y
167
+
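Note on the objective above (annotation, not from the repository): compute_loss implements the optimal-transport conditional flow-matching target. With t ~ U(0, 1) (cosine-warped when t_scheduler == 'cosine') and noise z ~ N(0, I), the interpolated sample is y_t = (1 - (1 - sigma_min) * t) * z + t * x1 and the regression target is the constant velocity u = x1 - (1 - sigma_min) * z, so the path runs from pure noise at t = 0 to (almost) the target x1 at t = 1. At inference, solve_euler integrates the learned velocity with fixed Euler steps and, when inference_cfg_rate > 0, applies classifier-free guidance as (1 + w) * dphi_dt_cond - w * dphi_dt_uncond with w = inference_cfg_rate.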
inspiremusic/flow/length_regulator.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Tuple
15
+ import torch.nn as nn
16
+ import torch
17
+ from torch.nn import functional as F
18
+ from inspiremusic.utils.mask import make_pad_mask
19
+
20
+
21
+ class InterpolateRegulator(nn.Module):
22
+ def __init__(
23
+ self,
24
+ channels: int,
25
+ sampling_ratios: Tuple,
26
+ out_channels: int = None,
27
+ groups: int = 1,
28
+ ):
29
+ super().__init__()
30
+ self.sampling_ratios = sampling_ratios
31
+ out_channels = out_channels or channels
32
+ model = nn.ModuleList([])
33
+ if len(sampling_ratios) > 0:
34
+ for _ in sampling_ratios:
35
+ module = nn.Conv1d(channels, channels, 3, 1, 1)
36
+ norm = nn.GroupNorm(groups, channels)
37
+ act = nn.Mish()
38
+ model.extend([module, norm, act])
39
+ model.append(
40
+ nn.Conv1d(channels, out_channels, 1, 1)
41
+ )
42
+ self.model = nn.Sequential(*model)
43
+
44
+ def forward(self, x, ylens=None):
45
+ # x in (B, T, D)
46
+ mask = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
47
+ x = F.interpolate(x.transpose(1, 2).contiguous(), size=ylens.max(), mode='linear')
48
+ out = self.model(x).transpose(1, 2).contiguous()
49
+ olens = ylens
50
+ return out * mask, olens
51
+
52
+ def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
53
+ # in inference mode, interploate prompt token and token(head/mid/tail) seprately, so we can get a clear separation point of mel
54
+ # x in (B, T, D)
55
+ if x2.shape[1] > 40:
56
+ x2_head = F.interpolate(x2[:, :20].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
57
+ x2_mid = F.interpolate(x2[:, 20:-20].transpose(1, 2).contiguous(), size=mel_len2 - int(20 / input_frame_rate * 22050 / 256) * 2,
58
+ mode='linear')
59
+ x2_tail = F.interpolate(x2[:, -20:].transpose(1, 2).contiguous(), size=int(20 / input_frame_rate * 22050 / 256), mode='linear')
60
+ x2 = torch.concat([x2_head, x2_mid, x2_tail], dim=2)
61
+ else:
62
+ x2 = F.interpolate(x2.transpose(1, 2).contiguous(), size=mel_len2, mode='linear')
63
+ if x1.shape[1] != 0:
64
+ x1 = F.interpolate(x1.transpose(1, 2).contiguous(), size=mel_len1, mode='linear')
65
+ x = torch.concat([x1, x2], dim=2)
66
+ else:
67
+ x = x2
68
+ out = self.model(x).transpose(1, 2).contiguous()
69
+ return out, mel_len1 + mel_len2
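A usage sketch for the length regulator above (illustrative only; the channel size is a placeholder):

    import torch
    regulator = InterpolateRegulator(channels=512, sampling_ratios=[1, 1, 1, 1])
    x = torch.randn(2, 100, 512)            # (batch, time, dim)
    ylens = torch.tensor([150, 120])        # target lengths per utterance
    out, olens = regulator(x, ylens)
    assert out.shape == (2, 150, 512)       # interpolated to max(ylens), masked past each length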
inspiremusic/hifigan/discriminator.py ADDED
@@ -0,0 +1,140 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn.utils import weight_norm
4
+ from typing import List, Optional, Tuple
5
+ from einops import rearrange
6
+ from torchaudio.transforms import Spectrogram
7
+
8
+
9
+ class MultipleDiscriminator(nn.Module):
10
+ def __init__(
11
+ self, mpd: nn.Module, mrd: nn.Module
12
+ ):
13
+ super().__init__()
14
+ self.mpd = mpd
15
+ self.mrd = mrd
16
+
17
+ def forward(self, y: torch.Tensor, y_hat: torch.Tensor):
18
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
19
+ this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mpd(y.unsqueeze(dim=1), y_hat.unsqueeze(dim=1))
20
+ y_d_rs += this_y_d_rs
21
+ y_d_gs += this_y_d_gs
22
+ fmap_rs += this_fmap_rs
23
+ fmap_gs += this_fmap_gs
24
+ this_y_d_rs, this_y_d_gs, this_fmap_rs, this_fmap_gs = self.mrd(y, y_hat)
25
+ y_d_rs += this_y_d_rs
26
+ y_d_gs += this_y_d_gs
27
+ fmap_rs += this_fmap_rs
28
+ fmap_gs += this_fmap_gs
29
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
30
+
31
+
32
+ class MultiResolutionDiscriminator(nn.Module):
33
+ def __init__(
34
+ self,
35
+ fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
36
+ num_embeddings: Optional[int] = None,
37
+ ):
38
+ """
39
+ Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
40
+ Additionally, it allows incorporating conditional information with a learned embeddings table.
41
+
42
+ Args:
43
+ fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
44
+ num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
45
+ Defaults to None.
46
+ """
47
+
48
+ super().__init__()
49
+ self.discriminators = nn.ModuleList(
50
+ [DiscriminatorR(window_length=w, num_embeddings=num_embeddings) for w in fft_sizes]
51
+ )
52
+
53
+ def forward(
54
+ self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
55
+ ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
56
+ y_d_rs = []
57
+ y_d_gs = []
58
+ fmap_rs = []
59
+ fmap_gs = []
60
+
61
+ for d in self.discriminators:
62
+ y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
63
+ y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
64
+ y_d_rs.append(y_d_r)
65
+ fmap_rs.append(fmap_r)
66
+ y_d_gs.append(y_d_g)
67
+ fmap_gs.append(fmap_g)
68
+
69
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
70
+
71
+
72
+ class DiscriminatorR(nn.Module):
73
+ def __init__(
74
+ self,
75
+ window_length: int,
76
+ num_embeddings: Optional[int] = None,
77
+ channels: int = 32,
78
+ hop_factor: float = 0.25,
79
+ bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
80
+ ):
81
+ super().__init__()
82
+ self.window_length = window_length
83
+ self.hop_factor = hop_factor
84
+ self.spec_fn = Spectrogram(
85
+ n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
86
+ )
87
+ n_fft = window_length // 2 + 1
88
+ bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
89
+ self.bands = bands
90
+ convs = lambda: nn.ModuleList(
91
+ [
92
+ weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
93
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
94
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
95
+ weight_norm(nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))),
96
+ weight_norm(nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))),
97
+ ]
98
+ )
99
+ self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
100
+
101
+ if num_embeddings is not None:
102
+ self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
103
+ torch.nn.init.zeros_(self.emb.weight)
104
+
105
+ self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
106
+
107
+ def spectrogram(self, x):
108
+ # Remove DC offset
109
+ x = x - x.mean(dim=-1, keepdims=True)
110
+ # Peak normalize the volume of input audio
111
+ x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
112
+ x = self.spec_fn(x)
113
+ x = torch.view_as_real(x)
114
+ x = rearrange(x, "b f t c -> b c t f")
115
+ # Split into bands
116
+ x_bands = [x[..., b[0]: b[1]] for b in self.bands]
117
+ return x_bands
118
+
119
+ def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
120
+ x_bands = self.spectrogram(x)
121
+ fmap = []
122
+ x = []
123
+ for band, stack in zip(x_bands, self.band_convs):
124
+ for i, layer in enumerate(stack):
125
+ band = layer(band)
126
+ band = torch.nn.functional.leaky_relu(band, 0.1)
127
+ if i > 0:
128
+ fmap.append(band)
129
+ x.append(band)
130
+ x = torch.cat(x, dim=-1)
131
+ if cond_embedding_id is not None:
132
+ emb = self.emb(cond_embedding_id)
133
+ h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
134
+ else:
135
+ h = 0
136
+ x = self.conv_post(x)
137
+ fmap.append(x)
138
+ x += h
139
+
140
+ return x, fmap
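A usage sketch for the multi-resolution discriminator above (illustrative only): each sub-discriminator returns per-resolution logits plus the feature maps used for the feature-matching loss.

    import torch
    mrd = MultiResolutionDiscriminator(fft_sizes=(2048, 1024, 512))
    real = torch.randn(2, 16000)            # (batch, samples)
    fake = torch.randn(2, 16000)
    y_d_rs, y_d_gs, fmap_rs, fmap_gs = mrd(real, fake)
    assert len(y_d_rs) == 3                 # one logit map per FFT resolution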
inspiremusic/hifigan/f0_predictor.py ADDED
@@ -0,0 +1,55 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn.utils import weight_norm
17
+
18
+
19
+ class ConvRNNF0Predictor(nn.Module):
20
+ def __init__(self,
21
+ num_class: int = 1,
22
+ in_channels: int = 80,
23
+ cond_channels: int = 512
24
+ ):
25
+ super().__init__()
26
+
27
+ self.num_class = num_class
28
+ self.condnet = nn.Sequential(
29
+ weight_norm(
30
+ nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
31
+ ),
32
+ nn.ELU(),
33
+ weight_norm(
34
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
35
+ ),
36
+ nn.ELU(),
37
+ weight_norm(
38
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
39
+ ),
40
+ nn.ELU(),
41
+ weight_norm(
42
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
43
+ ),
44
+ nn.ELU(),
45
+ weight_norm(
46
+ nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
47
+ ),
48
+ nn.ELU(),
49
+ )
50
+ self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
51
+
52
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
53
+ x = self.condnet(x)
54
+ x = x.transpose(1, 2)
55
+ return torch.abs(self.classifier(x).squeeze(-1))
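A usage sketch for the F0 predictor above (illustrative only):

    import torch
    predictor = ConvRNNF0Predictor(in_channels=80, cond_channels=512)
    mel = torch.randn(1, 80, 200)           # (batch, mel_bins, frames)
    f0 = predictor(mel)
    assert f0.shape == (1, 200)             # one non-negative F0 value per frame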
inspiremusic/hifigan/generator.py ADDED
@@ -0,0 +1,411 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """HIFI-GAN"""
16
+
17
+ from typing import Dict, Optional, List
18
+ import numpy as np
19
+ from scipy.signal import get_window
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from torch.nn import Conv1d
24
+ from torch.nn import ConvTranspose1d
25
+ from torch.nn.utils import remove_weight_norm
26
+ from torch.nn.utils import weight_norm
27
+ from torch.distributions.uniform import Uniform
28
+
29
+ from inspiremusic.transformer.activation import Snake
30
+ from inspiremusic.utils.common import get_padding
31
+ from inspiremusic.utils.common import init_weights
32
+
33
+
34
+ """hifigan based generator implementation.
35
+
36
+ This code is modified from https://github.com/jik876/hifi-gan
37
+ ,https://github.com/kan-bayashi/ParallelWaveGAN and
38
+ https://github.com/NVIDIA/BigVGAN
39
+
40
+ """
41
+
42
+
43
+ class ResBlock(torch.nn.Module):
44
+ """Residual block module in HiFiGAN/BigVGAN."""
45
+ def __init__(
46
+ self,
47
+ channels: int = 512,
48
+ kernel_size: int = 3,
49
+ dilations: List[int] = [1, 3, 5],
50
+ ):
51
+ super(ResBlock, self).__init__()
52
+ self.convs1 = nn.ModuleList()
53
+ self.convs2 = nn.ModuleList()
54
+
55
+ for dilation in dilations:
56
+ self.convs1.append(
57
+ weight_norm(
58
+ Conv1d(
59
+ channels,
60
+ channels,
61
+ kernel_size,
62
+ 1,
63
+ dilation=dilation,
64
+ padding=get_padding(kernel_size, dilation)
65
+ )
66
+ )
67
+ )
68
+ self.convs2.append(
69
+ weight_norm(
70
+ Conv1d(
71
+ channels,
72
+ channels,
73
+ kernel_size,
74
+ 1,
75
+ dilation=1,
76
+ padding=get_padding(kernel_size, 1)
77
+ )
78
+ )
79
+ )
80
+ self.convs1.apply(init_weights)
81
+ self.convs2.apply(init_weights)
82
+ self.activations1 = nn.ModuleList([
83
+ Snake(channels, alpha_logscale=False)
84
+ for _ in range(len(self.convs1))
85
+ ])
86
+ self.activations2 = nn.ModuleList([
87
+ Snake(channels, alpha_logscale=False)
88
+ for _ in range(len(self.convs2))
89
+ ])
90
+
91
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
92
+ for idx in range(len(self.convs1)):
93
+ xt = self.activations1[idx](x)
94
+ xt = self.convs1[idx](xt)
95
+ xt = self.activations2[idx](xt)
96
+ xt = self.convs2[idx](xt)
97
+ x = xt + x
98
+ return x
99
+
100
+ def remove_weight_norm(self):
101
+ for idx in range(len(self.convs1)):
102
+ remove_weight_norm(self.convs1[idx])
103
+ remove_weight_norm(self.convs2[idx])
104
+
105
+
106
+ class SineGen(torch.nn.Module):
107
+ """ Definition of sine generator
108
+ SineGen(samp_rate, harmonic_num = 0,
109
+ sine_amp = 0.1, noise_std = 0.003,
110
+ voiced_threshold = 0,
111
+ flag_for_pulse=False)
112
+ samp_rate: sampling rate in Hz
113
+ harmonic_num: number of harmonic overtones (default 0)
114
+ sine_amp: amplitude of sine-wavefrom (default 0.1)
115
+ noise_std: std of Gaussian noise (default 0.003)
116
+ voiced_thoreshold: F0 threshold for U/V classification (default 0)
117
+ flag_for_pulse: this SinGen is used inside PulseGen (default False)
118
+ Note: when flag_for_pulse is True, the first time step of a voiced
119
+ segment is always sin(np.pi) or cos(0)
120
+ """
121
+
122
+ def __init__(self, samp_rate, harmonic_num=0,
123
+ sine_amp=0.1, noise_std=0.003,
124
+ voiced_threshold=0):
125
+ super(SineGen, self).__init__()
126
+ self.sine_amp = sine_amp
127
+ self.noise_std = noise_std
128
+ self.harmonic_num = harmonic_num
129
+ self.sampling_rate = samp_rate
130
+ self.voiced_threshold = voiced_threshold
131
+
132
+ def _f02uv(self, f0):
133
+ # generate uv signal
134
+ uv = (f0 > self.voiced_threshold).type(torch.float32)
135
+ return uv
136
+
137
+ @torch.no_grad()
138
+ def forward(self, f0):
139
+ """
140
+ :param f0: [B, 1, sample_len], Hz
141
+ :return: [B, 1, sample_len]
142
+ """
143
+
144
+ F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
145
+ for i in range(self.harmonic_num + 1):
146
+ F_mat[:, i: i + 1, :] = f0 * (i + 1) / self.sampling_rate
147
+
148
+ theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
149
+ u_dist = Uniform(low=-np.pi, high=np.pi)
150
+ phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
151
+ phase_vec[:, 0, :] = 0
152
+
153
+ # generate sine waveforms
154
+ sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
155
+
156
+ # generate uv signal
157
+ uv = self._f02uv(f0)
158
+
159
+ # noise: for unvoiced should be similar to sine_amp
160
+ # std = self.sine_amp/3 -> max value ~ self.sine_amp
161
+ # . for voiced regions is self.noise_std
162
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
163
+ noise = noise_amp * torch.randn_like(sine_waves)
164
+
165
+ # first: set the unvoiced part to 0 by uv
166
+ # then: additive noise
167
+ sine_waves = sine_waves * uv + noise
168
+ return sine_waves, uv, noise
169
+
170
+
171
+ class SourceModuleHnNSF(torch.nn.Module):
172
+ """ SourceModule for hn-nsf
173
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
174
+ add_noise_std=0.003, voiced_threshod=0)
175
+ sampling_rate: sampling_rate in Hz
176
+ harmonic_num: number of harmonic above F0 (default: 0)
177
+ sine_amp: amplitude of sine source signal (default: 0.1)
178
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
179
+ note that amplitude of noise in unvoiced is decided
180
+ by sine_amp
181
+ voiced_threshold: threhold to set U/V given F0 (default: 0)
182
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
183
+ F0_sampled (batchsize, length, 1)
184
+ Sine_source (batchsize, length, 1)
185
+ noise_source (batchsize, length 1)
186
+ uv (batchsize, length, 1)
187
+ """
188
+
189
+ def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
190
+ add_noise_std=0.003, voiced_threshod=0):
191
+ super(SourceModuleHnNSF, self).__init__()
192
+
193
+ self.sine_amp = sine_amp
194
+ self.noise_std = add_noise_std
195
+
196
+ # to produce sine waveforms
197
+ self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
198
+ sine_amp, add_noise_std, voiced_threshod)
199
+
200
+ # to merge source harmonics into a single excitation
201
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
202
+ self.l_tanh = torch.nn.Tanh()
203
+
204
+ def forward(self, x):
205
+ """
206
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
207
+ F0_sampled (batchsize, length, 1)
208
+ Sine_source (batchsize, length, 1)
209
+ noise_source (batchsize, length 1)
210
+ """
211
+ # source for harmonic branch
212
+ with torch.no_grad():
213
+ sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
214
+ sine_wavs = sine_wavs.transpose(1, 2)
215
+ uv = uv.transpose(1, 2)
216
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
217
+
218
+ # source for noise branch, in the same shape as uv
219
+ noise = torch.randn_like(uv) * self.sine_amp / 3
220
+ return sine_merge, noise, uv
221
+
222
+
223
+ class HiFTGenerator(nn.Module):
224
+ """
225
+ HiFTNet Generator: Neural Source Filter + ISTFTNet
226
+ https://arxiv.org/abs/2309.09493
227
+ """
228
+ def __init__(
229
+ self,
230
+ in_channels: int = 80,
231
+ base_channels: int = 512,
232
+ nb_harmonics: int = 8,
233
+ sampling_rate: int = 22050,
234
+ nsf_alpha: float = 0.1,
235
+ nsf_sigma: float = 0.003,
236
+ nsf_voiced_threshold: float = 10,
237
+ upsample_rates: List[int] = [8, 8],
238
+ upsample_kernel_sizes: List[int] = [16, 16],
239
+ istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
240
+ resblock_kernel_sizes: List[int] = [3, 7, 11],
241
+ resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
242
+ source_resblock_kernel_sizes: List[int] = [7, 11],
243
+ source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
244
+ lrelu_slope: float = 0.1,
245
+ audio_limit: float = 0.99,
246
+ f0_predictor: torch.nn.Module = None,
247
+ ):
248
+ super(HiFTGenerator, self).__init__()
249
+
250
+ self.out_channels = 1
251
+ self.nb_harmonics = nb_harmonics
252
+ self.sampling_rate = sampling_rate
253
+ self.istft_params = istft_params
254
+ self.lrelu_slope = lrelu_slope
255
+ self.audio_limit = audio_limit
256
+
257
+ self.num_kernels = len(resblock_kernel_sizes)
258
+ self.num_upsamples = len(upsample_rates)
259
+ self.m_source = SourceModuleHnNSF(
260
+ sampling_rate=sampling_rate,
261
+ upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
262
+ harmonic_num=nb_harmonics,
263
+ sine_amp=nsf_alpha,
264
+ add_noise_std=nsf_sigma,
265
+ voiced_threshod=nsf_voiced_threshold)
266
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
267
+
268
+ self.conv_pre = weight_norm(
269
+ Conv1d(in_channels, base_channels, 7, 1, padding=3)
270
+ )
271
+
272
+ # Up
273
+ self.ups = nn.ModuleList()
274
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
275
+ self.ups.append(
276
+ weight_norm(
277
+ ConvTranspose1d(
278
+ base_channels // (2**i),
279
+ base_channels // (2**(i + 1)),
280
+ k,
281
+ u,
282
+ padding=(k - u) // 2,
283
+ )
284
+ )
285
+ )
286
+
287
+ # Down
288
+ self.source_downs = nn.ModuleList()
289
+ self.source_resblocks = nn.ModuleList()
290
+ downsample_rates = [1] + upsample_rates[::-1][:-1]
291
+ downsample_cum_rates = np.cumprod(downsample_rates)
292
+ for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
293
+ if u == 1:
294
+ self.source_downs.append(
295
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
296
+ )
297
+ else:
298
+ self.source_downs.append(
299
+ Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
300
+ )
301
+
302
+ self.source_resblocks.append(
303
+ ResBlock(base_channels // (2 ** (i + 1)), k, d)
304
+ )
305
+
306
+ self.resblocks = nn.ModuleList()
307
+ for i in range(len(self.ups)):
308
+ ch = base_channels // (2**(i + 1))
309
+ for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
310
+ self.resblocks.append(ResBlock(ch, k, d))
311
+
312
+ self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
313
+ self.ups.apply(init_weights)
314
+ self.conv_post.apply(init_weights)
315
+ self.reflection_pad = nn.ReflectionPad1d((1, 0))
316
+ self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
317
+ self.f0_predictor = f0_predictor
318
+
319
+ def remove_weight_norm(self):
320
+ print('Removing weight norm...')
321
+ for l in self.ups:
322
+ remove_weight_norm(l)
323
+ for l in self.resblocks:
324
+ l.remove_weight_norm()
325
+ remove_weight_norm(self.conv_pre)
326
+ remove_weight_norm(self.conv_post)
327
+ self.m_source.remove_weight_norm()
328
+ for l in self.source_downs:
329
+ remove_weight_norm(l)
330
+ for l in self.source_resblocks:
331
+ l.remove_weight_norm()
332
+
333
+ def _stft(self, x):
334
+ spec = torch.stft(
335
+ x,
336
+ self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
337
+ return_complex=True)
338
+ spec = torch.view_as_real(spec) # [B, F, TT, 2]
339
+ return spec[..., 0], spec[..., 1]
340
+
341
+ def _istft(self, magnitude, phase):
342
+ magnitude = torch.clip(magnitude, max=1e2)
343
+ real = magnitude * torch.cos(phase)
344
+ img = magnitude * torch.sin(phase)
345
+ inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
346
+ self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
347
+ return inverse_transform
348
+
349
+ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
350
+ s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
351
+ s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
352
+
353
+ x = self.conv_pre(x)
354
+ for i in range(self.num_upsamples):
355
+ x = F.leaky_relu(x, self.lrelu_slope)
356
+ x = self.ups[i](x)
357
+
358
+ if i == self.num_upsamples - 1:
359
+ x = self.reflection_pad(x)
360
+
361
+ # fusion
362
+ si = self.source_downs[i](s_stft)
363
+ si = self.source_resblocks[i](si)
364
+ x = x + si
365
+
366
+ xs = None
367
+ for j in range(self.num_kernels):
368
+ if xs is None:
369
+ xs = self.resblocks[i * self.num_kernels + j](x)
370
+ else:
371
+ xs += self.resblocks[i * self.num_kernels + j](x)
372
+ x = xs / self.num_kernels
373
+
374
+ x = F.leaky_relu(x)
375
+ x = self.conv_post(x)
376
+ magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
377
+ phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :]) # actually, sin is redundancy
378
+
379
+ x = self._istft(magnitude, phase)
380
+ x = torch.clamp(x, -self.audio_limit, self.audio_limit)
381
+ return x
382
+
383
+ def forward(
384
+ self,
385
+ batch: dict,
386
+ device: torch.device,
387
+ ) -> Dict[str, Optional[torch.Tensor]]:
388
+ speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
389
+ # mel->f0
390
+ f0 = self.f0_predictor(speech_feat)
391
+ # f0->source
392
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
393
+ s, _, _ = self.m_source(s)
394
+ s = s.transpose(1, 2)
395
+ # mel+source->speech
396
+ generated_speech = self.decode(x=speech_feat, s=s)
397
+ return generated_speech, f0
398
+
399
+ @torch.inference_mode()
400
+ def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
401
+ # mel->f0
402
+ f0 = self.f0_predictor(speech_feat)
403
+ # f0->source
404
+ s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t
405
+ s, _, _ = self.m_source(s)
406
+ s = s.transpose(1, 2)
407
+ # use cache_source to avoid glitch
408
+ if cache_source.shape[2] != 0:
409
+ s[:, :, :cache_source.shape[2]] = cache_source
410
+ generated_speech = self.decode(x=speech_feat, s=s)
411
+ return generated_speech, s
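A vocoding sketch for HiFTGenerator above (illustrative only; default hyperparameters are assumed and the mel tensor is random):

    import torch
    from inspiremusic.hifigan.f0_predictor import ConvRNNF0Predictor
    hift = HiFTGenerator(f0_predictor=ConvRNNF0Predictor())
    mel = torch.randn(1, 80, 200)                                    # (batch, mel_bins, frames)
    wav, source = hift.inference(speech_feat=mel)                    # wav: (1, samples)
    wav2, _ = hift.inference(speech_feat=mel, cache_source=source)   # reuse the source to avoid glitches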
inspiremusic/hifigan/hifigan.py ADDED
@@ -0,0 +1,66 @@
1
+ from typing import Dict, Optional
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from matcha.hifigan.models import feature_loss, generator_loss, discriminator_loss
6
+ from inspiremusic.utils.losses import tpr_loss, mel_loss
7
+
8
+ class HiFiGan(nn.Module):
9
+ def __init__(self, generator, discriminator, mel_spec_transform,
10
+ multi_mel_spectral_recon_loss_weight=45, feat_match_loss_weight=2.0,
11
+ tpr_loss_weight=1.0, tpr_loss_tau=0.04):
12
+ super(HiFiGan, self).__init__()
13
+ self.generator = generator
14
+ self.discriminator = discriminator
15
+ self.mel_spec_transform = mel_spec_transform
16
+ self.multi_mel_spectral_recon_loss_weight = multi_mel_spectral_recon_loss_weight
17
+ self.feat_match_loss_weight = feat_match_loss_weight
18
+ self.tpr_loss_weight = tpr_loss_weight
19
+ self.tpr_loss_tau = tpr_loss_tau
20
+
21
+ def forward(
22
+ self,
23
+ batch: dict,
24
+ device: torch.device,
25
+ ) -> Dict[str, Optional[torch.Tensor]]:
26
+ if batch['turn'] == 'generator':
27
+ return self.forward_generator(batch, device)
28
+ else:
29
+ return self.forward_discriminator(batch, device)
30
+
31
+ def forward_generator(self, batch, device):
32
+ real_speech = batch['speech'].to(device)
33
+ pitch_feat = batch['pitch_feat'].to(device)
34
+ # 1. calculate generator outputs
35
+ generated_speech, generated_f0 = self.generator(batch, device)
36
+ # 2. calculate discriminator outputs
37
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
38
+ # 3. calculate generator losses, feature loss, mel loss, tpr losses [Optional]
39
+ loss_gen, _ = generator_loss(y_d_gs)
40
+ loss_fm = feature_loss(fmap_rs, fmap_gs)
41
+ loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
42
+ if self.tpr_loss_weight != 0:
43
+ loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
44
+ else:
45
+ loss_tpr = torch.zeros(1).to(device)
46
+ loss_f0 = F.l1_loss(generated_f0, pitch_feat)
47
+ loss = loss_gen + self.feat_match_loss_weight * loss_fm + \
48
+ self.multi_mel_spectral_recon_loss_weight * loss_mel + \
49
+ self.tpr_loss_weight * loss_tpr + loss_f0
50
+ return {'loss': loss, 'loss_gen': loss_gen, 'loss_fm': loss_fm, 'loss_mel': loss_mel, 'loss_tpr': loss_tpr, 'loss_f0': loss_f0}
51
+
52
+ def forward_discriminator(self, batch, device):
53
+ real_speech = batch['speech'].to(device)
54
+ # 1. calculate generator outputs
55
+ with torch.no_grad():
56
+ generated_speech, generated_f0 = self.generator(batch, device)
57
+ # 2. calculate discriminator outputs
58
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
59
+ # 3. calculate discriminator losses, tpr losses [Optional]
60
+ loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
61
+ if self.tpr_loss_weight != 0:
62
+ loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
63
+ else:
64
+ loss_tpr = torch.zeros(1).to(device)
65
+ loss = loss_disc + self.tpr_loss_weight * loss_tpr
66
+ return {'loss': loss, 'loss_disc': loss_disc, 'loss_tpr': loss_tpr}
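Since `HiFiGan.forward()` dispatches on `batch['turn']`, the surrounding training loop is expected to alternate generator and discriminator updates. A minimal schematic of that alternation, with trivial stand-in modules and losses (assumptions, not the actual InspireMusic trainer), could look like:

import torch
import torch.nn as nn

gen, disc = nn.Linear(8, 8), nn.Linear(8, 1)
opt_g = torch.optim.AdamW(gen.parameters(), lr=2e-4)
opt_d = torch.optim.AdamW(disc.parameters(), lr=2e-4)

real = torch.randn(4, 8)
for step in range(4):
    turn = 'generator' if step % 2 == 0 else 'discriminator'
    if turn == 'generator':
        fake = gen(torch.randn(4, 8))
        loss = (1 - disc(fake)).pow(2).mean()  # stand-in adversarial loss
        opt_g.zero_grad(); loss.backward(); opt_g.step()
    else:
        with torch.no_grad():  # freeze generator on the discriminator turn
            fake = gen(torch.randn(4, 8))
        loss = (1 - disc(real)).pow(2).mean() + disc(fake).pow(2).mean()
        opt_d.zero_grad(); loss.backward(); opt_d.step()
    print(turn, float(loss))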
inspiremusic/llm/llm.py ADDED
@@ -0,0 +1,402 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Dict, Optional, Callable, List, Generator
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn.utils.rnn import pad_sequence, unpad_sequence
18
+ from inspiremusic.utils.common import IGNORE_ID
19
+ from inspiremusic.transformer.label_smoothing_loss import LabelSmoothingLoss
20
+ from inspiremusic.utils.common import th_accuracy
21
+ from torch import Tensor
22
+ from math import log
23
+ from einops import rearrange, reduce, repeat
24
+ import logging
25
+
26
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
27
+
28
+ class SinusoidalEmbedding(nn.Module):
29
+ def __init__(self, dim: int):
30
+ super().__init__()
31
+ self.dim = dim
32
+
33
+ def forward(self, x: Tensor) -> Tensor:
34
+ device, half_dim = x.device, self.dim // 2
35
+ emb = torch.tensor(log(10000) / (half_dim - 1), device=device)
36
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
37
+ emb = rearrange(x, "i -> i 1") * rearrange(emb, "j -> 1 j")
38
+ return torch.cat((emb.sin(), emb.cos()), dim=-1).to(torch.float32)
39
+
40
+ class LLM(torch.nn.Module):
41
+ def __init__(
42
+ self,
43
+ text_encoder_input_size: int,
44
+ llm_input_size: int,
45
+ llm_output_size: int,
46
+ audio_token_size: int,
47
+ llm: torch.nn.Module,
48
+ sampling: Callable,
49
+ text_encoder_conf: Dict = None,
50
+ length_normalized_loss: bool = True,
51
+ lsm_weight: float = 0.0,
52
+ frozen_input_embed: bool = False,
53
+ **kwargs,
54
+ ):
55
+ super().__init__()
56
+ self.llm_input_size = llm_input_size
57
+ self.audio_token_size = audio_token_size
58
+ # 1. build text token inputs related modules
59
+
60
+ if llm is None:
61
+ self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
62
+ else:
63
+ self.text_embedding = llm.model.model.embed_tokens
64
+ if frozen_input_embed:
65
+ print("Freezing input embedding layer")
66
+ for p in self.text_embedding.parameters():
67
+ p.requires_grad = False
68
+ self.chorus_embedding = torch.nn.Embedding(5, llm_input_size) # intro, chorus, verse1, verse2, outro
69
+
70
+ self.text_encoder_conf = text_encoder_conf
71
+ self.text_encoder = self.build_encoder(text_encoder_conf)
72
+ self.infer_cfg_ratio = kwargs.get("infer_cfg_ratio", None)
73
+ logging.info(f"infer_cfg_ratio: {self.infer_cfg_ratio}")
74
+ self.train_cfg_ratio = kwargs.get("train_cfg_ratio", None)
75
+ logging.info(f"train_cfg_ratio: {self.train_cfg_ratio}")
76
+ # 2. build audio token language model related modules
77
+ self.sos_eos = 0
78
+ self.task_id = 1
79
+
80
+ self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
81
+ self.llm = llm
82
+ self.llm_decoder = nn.Linear(llm_output_size, audio_token_size + 1)
83
+ self.criterion_ce = LabelSmoothingLoss(
84
+ size=audio_token_size + 1,
85
+ padding_idx=IGNORE_ID,
86
+ smoothing=lsm_weight,
87
+ normalize_length=length_normalized_loss,
88
+ )
89
+
90
+ # 3. [Optional] build audio token related modules
91
+ self.speech_embedding = torch.nn.Embedding(audio_token_size, llm_input_size)
92
+ self.spk_embed_affine_layer = torch.nn.Linear(192, llm_input_size)
93
+ self.num_codebooks = 4
94
+ # 4. sampling method
95
+ self.sampling = sampling
96
+ self.time_embedding = SinusoidalEmbedding(llm_input_size)
97
+
98
+ def cfg_dropout(self, text_token, text_token_len, p):
99
+ # Classifier-Free Guidance Dropout
100
+ B = text_token.size(0)
101
+ num_samples_to_mask = int(p * B)
102
+ if num_samples_to_mask == 0:
103
+ num_samples_to_mask = 1
104
+ indices_to_mask = torch.randperm(B, device=text_token.device)[:num_samples_to_mask]
105
+ text_token[indices_to_mask] = 0
106
+ text_token_len[indices_to_mask] = 0
107
+
108
+ return text_token, text_token_len
109
+
110
+ def build_encoder(self, encoder_conf=None):
111
+ if encoder_conf is None:
112
+ assert hasattr(self, "encoder_conf"), \
113
+ "function param encoder_conf is None and model doesn't has encoder_conf attribute either."
114
+ encoder_conf = self.encoder_conf
115
+
116
+ encoder_name = encoder_conf.pop("name", "transformer")
117
+ model = None
118
+ if encoder_name == "transformer":
119
+ from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
120
+ model = ConformerEncoder(
121
+ **encoder_conf,
122
+ input_size=self.input_size,
123
+ use_cnn_module=False,
124
+ macaron_style=False,
125
+ )
126
+ elif encoder_name == "conformer":
127
+ from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
128
+ model = ConformerEncoder(
129
+ **encoder_conf,
130
+ input_size=self.input_size,
131
+ )
132
+ elif encoder_name == "llama_encoder":
133
+ from inspiremusic.transformer.encoder.llama_encoder import LlamaEncoder
134
+ model = LlamaEncoder(
135
+ **encoder_conf,
136
+ input_size=self.input_size,
137
+ )
138
+ elif encoder_name == "qwen2":
139
+ from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
140
+ model = QwenEncoder(
141
+ **encoder_conf,
142
+ input_size=self.input_size,
143
+ )
144
+ elif encoder_name == "qwen2.5":
145
+ from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
146
+ model = QwenEncoder(
147
+ **encoder_conf,
148
+ input_size=self.input_size,
149
+ )
150
+
151
+ encoder_conf["name"] = encoder_name
152
+
153
+ return model
154
+
155
+ def encode(self,
156
+ text: torch.Tensor,
157
+ text_lengths: torch.Tensor):
158
+ if self.text_encoder is not None:
159
+ encoder_out, encoder_mask = self.text_encoder(text, text_lengths,
160
+ decoding_chunk_size=1,
161
+ num_decoding_left_chunks=-1)
162
+ encoder_out_lens = encoder_mask.squeeze(1).sum(1)
163
+ encoder_out = self.text_encoder_affine_layer(encoder_out)
164
+ else:
165
+ encoder_out, encoder_out_lens = text, text_lengths
166
+ return encoder_out, encoder_out_lens
167
+
168
+ def pad_unpad_sequence(self, sos_eos_emb, embeddings, text_token,
169
+ text_token_len, task_id_emb, audio_token,
170
+ audio_token_len, seg_len):
171
+ text_token = unpad_sequence(text_token, text_token_len.cpu(),
172
+ batch_first=True)
173
+
174
+ audio_token = unpad_sequence(audio_token, audio_token_len.cpu(),
175
+ batch_first=True)
176
+
177
+ for i in range(len(embeddings)):
178
+ embeddings[i] = unpad_sequence(embeddings[i], seg_len.cpu(), batch_first=True)
179
+
180
+ lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0)] + [embedding[i] for embedding in embeddings] + [text_token[i], task_id_emb.squeeze(dim=0), audio_token[i]], dim=0) for i in range(len(text_token))]
181
+ lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
182
+ lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
183
+ return lm_input, lm_input_len
184
+
185
+ def forward(
186
+ self,
187
+ batch: dict,
188
+ device: torch.device,
189
+ ) -> Dict[str, Optional[torch.Tensor]]:
190
+ """
191
+ Args:
192
+ text: (B, L, D)
193
+ text_lengths: (B,)
194
+ audio: (B, T, N) or (B, T)
195
+ audio_lengths: (B,)
196
+ """
197
+ mask = True
198
+ text_token = batch['text_token'].to(device)
199
+ text_token_len = batch['text_token_len'].to(device)
200
+ if "semantic_token" not in batch:
201
+ audio_token = batch['acoustic_token'].to(device)
202
+ audio_token_len = batch['acoustic_token_len'].to(device)
203
+ audio_token = audio_token.view(audio_token.size(0), -1, self.num_codebooks)
204
+ audio_token = audio_token[:, :, 0]
205
+ audio_token_len = (audio_token_len / self.num_codebooks).long()
206
+
207
+ else:
208
+ audio_token = batch['semantic_token'].to(device)
209
+ audio_token_len = batch['semantic_token_len'].to(device)
210
+
211
+ time_start = batch['time_start'].to(device)
212
+ time_end = batch['time_end'].to(device)
213
+ chorus = batch['chorus'].to(device)
214
+ # 1. encode text_token
215
+
216
+ if self.train_cfg_ratio > 0:
217
+ # Classifier-Free Guidance
218
+ text_token, _ = self.cfg_dropout(text_token, text_token_len, self.train_cfg_ratio)
219
+
220
+ # 2. Time Embedding & chorus embedding
221
+ text_token = self.text_embedding(text_token)
222
+ text_token, text_token_len = self.encode(text_token, text_token_len)
223
+ if mask:
224
+ time_mask = time_start != -1.0
225
+ seg_len = time_mask.sum(-1)
226
+ time_start = time_start.masked_fill(~time_mask, 0.0)
227
+ time_end = time_end.masked_fill(~time_mask, 0.0)
228
+ chorus = chorus.masked_fill(~time_mask, 0)
229
+ time_start_embed = self.time_embedding(time_start.view(-1)).to(text_token.dtype)
230
+ time_end_embed = self.time_embedding(time_end.view(-1)).to(text_token.dtype)
231
+ time_start_embed = time_start_embed.view(chorus.size(0), chorus.size(1), -1)
232
+ time_end_embed = time_end_embed.view(chorus.size(0), chorus.size(1), -1)
233
+ chorus_embed = self.chorus_embedding(chorus)
234
+ lm_target = [torch.tensor([IGNORE_ID] * (1 + 3 * seg_len[i] + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
235
+ else:
236
+ time_start_embed = self.time_embedding(time_start).to(text_token.dtype)
237
+ time_end_embed = self.time_embedding(time_end).to(text_token.dtype)
238
+ chorus_embed = self.chorus_embedding(chorus)
239
+
240
+ lm_target = [torch.tensor(
241
+ [IGNORE_ID] * (4 + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
242
+
243
+ lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
244
+
245
+ # 3. eos and task_id
246
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
247
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
248
+
249
+ # 4. encode audio_token
250
+ audio_token = self.speech_embedding(audio_token)
251
+
252
+ # 5. unpad and pad
253
+ lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb,
254
+ [time_start_embed,
255
+ time_end_embed,
256
+ chorus_embed],
257
+ text_token,
258
+ text_token_len,
259
+ task_id_emb,
260
+ audio_token,
261
+ audio_token_len,
262
+ seg_len)
263
+ # 6. run lm forward
264
+ lm_output, lm_output_mask = self.llm(lm_input, lm_input_len.to(device))
265
+ logits = self.llm_decoder(lm_output)
266
+ loss = self.criterion_ce(logits, lm_target)
267
+
268
+ acc = th_accuracy(logits.view(-1, self.audio_token_size + 1), lm_target, ignore_label=IGNORE_ID)
269
+
270
+ return {'loss': loss, 'acc': acc}
271
+
272
+ def sampling_ids(
273
+ self,
274
+ weighted_scores: torch.Tensor,
275
+ decoded_tokens: List,
276
+ ignore_eos: bool = True,
277
+ ):
278
+ top_ids = self.sampling(weighted_scores, decoded_tokens)
279
+ return top_ids
280
+
281
+ @torch.inference_mode()
282
+ def inference(
283
+ self,
284
+ text: torch.Tensor,
285
+ text_len: torch.Tensor,
286
+ audio_token: torch.Tensor,
287
+ audio_token_len: torch.Tensor,
288
+ prompt_text: torch.Tensor,
289
+ prompt_text_len: torch.Tensor,
290
+ prompt_audio_token: torch.Tensor,
291
+ prompt_audio_token_len: torch.Tensor,
292
+ embeddings: List,
293
+ duration_to_gen: float = 30,
294
+ task: str = "continuation",
295
+ token_rate: int = 75,
296
+ limit_audio_prompt_len: int = 5,
297
+ ) -> Generator[torch.Tensor, None, None]:
298
+ device = text.device
299
+
300
+ if text is not None:
301
+ text = torch.concat([prompt_text, text], dim=1)
302
+ text_len += prompt_text_len
303
+ infer_cfg = self.infer_cfg_ratio >= 0.0
304
+ if infer_cfg:
305
+ text_cfg = self.text_embedding(text.new_zeros(text.shape))
306
+ text = self.text_embedding(text)
307
+
308
+ # 1. encode text
309
+ text, text_len = self.encode(text, text_len)
310
+
311
+ # 2. encode embedding
312
+ if embeddings is not None:
313
+ time_start, time_end, chorus = embeddings
314
+
315
+ if len(chorus.shape) == 1:
316
+ time_start_embed = self.time_embedding(time_start).reshape(1, 1, -1) # .half()
317
+ time_end_embed = self.time_embedding(time_end).reshape(1, 1, -1) # .half()
318
+ chorus_embed = self.chorus_embedding(chorus).reshape(1, 1, -1) # .half()
319
+ else:
320
+ time_start_embed = self.time_embedding(
321
+ time_start.view(-1)).reshape(1, chorus.size(1), -1) # .half()
322
+ time_end_embed = self.time_embedding(time_end.view(-1)).reshape(1, chorus.size(1), -1) # .half()
323
+ chorus_embed = self.chorus_embedding(chorus) # .half()
324
+
325
+ # 3. concat llm_input
326
+ sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
327
+ task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
328
+
329
+ if audio_token_len:
330
+ audio_token = audio_token[:, :(limit_audio_prompt_len * token_rate)]
331
+ audio_token_emb = self.speech_embedding(audio_token)
332
+ else:
333
+ audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
334
+
335
+ if prompt_audio_token_len:
336
+ prompt_audio_token_emb = self.speech_embedding(prompt_audio_token)
337
+ else:
338
+ prompt_audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
339
+ # Check whether removing the prompt audio token would break decoding.
340
+
341
+ if task == "continuation":
342
+ lm_input = torch.concat(
343
+ [sos_eos_emb, time_start_embed, time_end_embed,
344
+ chorus_embed, text, task_id_emb, audio_token_emb], dim=1)
345
+
346
+ if infer_cfg:
347
+ audio_cfg = self.speech_embedding(
348
+ audio_token.new_zeros(audio_token.shape))
349
+ lm_cf_input = torch.concat(
350
+ [sos_eos_emb, torch.rand_like(time_start_embed),
351
+ torch.rand_like(time_end_embed),
352
+ torch.rand_like(chorus_embed), text_cfg, task_id_emb,
353
+ audio_cfg], dim=1)
354
+ lm_input = torch.cat([lm_input, lm_cf_input], 0)
355
+ else:
356
+ lm_input = torch.concat(
357
+ [sos_eos_emb, time_start_embed, time_end_embed,
358
+ chorus_embed, text, task_id_emb], dim=1)
359
+ if infer_cfg:
360
+ lm_cf_input = torch.concat(
361
+ [sos_eos_emb, torch.rand_like(time_start_embed),
362
+ torch.rand_like(time_end_embed),
363
+ torch.rand_like(chorus_embed), text_cfg, task_id_emb],
364
+ dim=1)
365
+ lm_input = torch.cat([lm_input, lm_cf_input], 0)
366
+
367
+ # 4. cal min/max_length
368
+ min_len = duration_to_gen * token_rate
369
+ max_len = duration_to_gen * token_rate
370
+ logging.info(
371
+ f"LLM generation sequence length: {max_len}, generate audio length {duration_to_gen}s.")
372
+
373
+ # 5. step by step decode
374
+ out_tokens = []
375
+ offset = 0
376
+ state = None
377
+
378
+ for i in range(int(max_len)):
379
+ y_pred, _, state = self.llm.forward_one_step(lm_input, torch.ones(lm_input.shape[0], lm_input.shape[1], device=lm_input.device).to(torch.bool), cache=state)
380
+ logits = self.llm_decoder(y_pred[:, -1])
381
+ if infer_cfg:
382
+ # perform classifier-free guidance
383
+ logits_cf = logits[1]
384
+ logits = logits[0]
385
+ infer_cfg_ratio = self.infer_cfg_ratio
386
+ logits = infer_cfg_ratio * logits + (1 - infer_cfg_ratio) * logits_cf
387
+
388
+ logp = logits.log_softmax(dim=-1)
389
+ logp = logp.squeeze(dim=0)
390
+ top_ids = self.sampling_ids(logp, out_tokens, ignore_eos=i < min_len).item()
391
+
392
+ if top_ids == self.audio_token_size:
393
+ break
394
+
395
+ # in stream mode, yield tokens one by one
396
+
397
+ yield torch.tensor([[top_ids]], dtype=torch.int64, device=device)
398
+ out_tokens.append(top_ids)
399
+ offset += lm_input.size(1)
400
+ lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
401
+ if infer_cfg:
402
+ lm_input = lm_input.repeat(2, 1, 1)
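Inside the decoding loop above, the conditioned and unconditioned logits are mixed with `infer_cfg_ratio` before a token is sampled. Below is a minimal sketch of that guidance-and-sampling step; the vocabulary size, guidance ratio and the top-k sampler are assumptions (the real sampler is injected via the `sampling` callable):

import torch

vocab, cfg_ratio = 4097, 3.0
cond_logits = torch.randn(vocab)    # logits from the conditioned pass
uncond_logits = torch.randn(vocab)  # logits from the unconditioned (CFG) pass

# same mixing rule as in inference(): ratio * cond + (1 - ratio) * uncond
mixed = cfg_ratio * cond_logits + (1 - cfg_ratio) * uncond_logits
logp = mixed.log_softmax(dim=-1)

# illustrative top-k sampling of the next audio token
topk = torch.topk(logp, k=50)
probs = topk.values.softmax(dim=-1)
next_token = topk.indices[torch.multinomial(probs, 1)]
print(int(next_token))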
inspiremusic/metrics/clap_score.py ADDED
@@ -0,0 +1,135 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import requests
16
+ from tqdm import tqdm
17
+ import torch
18
+ import numpy as np
19
+ import laion_clap
20
+ from clap_module.factory import load_state_dict
21
+ import librosa
22
+ import pyloudnorm as pyln
23
+
24
+ # following documentation from https://github.com/LAION-AI/CLAP
25
+ def int16_to_float32(x):
26
+ return (x / 32767.0).astype(np.float32)
27
+
28
+ def float32_to_int16(x):
29
+ x = np.clip(x, a_min=-1., a_max=1.)
30
+ return (x * 32767.).astype(np.int16)
31
+
32
+
33
+ def clap_score(id2text, audio_path, audio_files_extension='.wav', clap_model='music_audioset_epoch_15_esc_90.14.pt'):
34
+ """
35
+ Cosine similarity is computed between the LAION-CLAP text embedding of the given prompt and
36
+ the LAION-CLAP audio embedding of the generated audio. LION-CLAP: https://github.com/LAION-AI/CLAP
37
+
38
+ This evaluation script assumes that audio_path files are identified with the ids in id2text.
39
+
40
+ clap_score() evaluates all ids in id2text.
41
+
42
+ GPU-based computation.
43
+
44
+ Select one of the following models from https://github.com/LAION-AI/CLAP:
45
+ - music_speech_audioset_epoch_15_esc_89.98.pt (used by musicgen)
46
+ - music_audioset_epoch_15_esc_90.14.pt
47
+ - music_speech_epoch_15_esc_89.25.pt
48
+ - 630k-audioset-fusion-best.pt (with "fusion" to handle longer inputs)
49
+
50
+ Params:
51
+ -- id2text: dictionary with the mapping between id (generated audio filenames in audio_path)
52
+ and text (prompt used to generate audio). clap_score() evaluates all ids in id2text.
53
+ -- audio_path: path where the generated audio files to evaluate are available.
54
+ -- audio_files_extension: files extension (default .wav) in eval_path.
55
+ -- clap_model: choose one of the above clap_models (default: 'music_audioset_epoch_15_esc_90.14.pt').
56
+ Returns:
57
+ -- LAION-CLAP score
58
+ """
59
+ # load model
60
+ if clap_model == 'music_speech_audioset_epoch_15_esc_89.98.pt':
61
+ url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_audioset_epoch_15_esc_89.98.pt'
62
+ clap_path = 'CLAP/music_speech_audioset_epoch_15_esc_89.98.pt'
63
+ model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda')
64
+ elif clap_model == 'music_audioset_epoch_15_esc_90.14.pt':
65
+ url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_audioset_epoch_15_esc_90.14.pt'
66
+ clap_path = 'CLAP/music_audioset_epoch_15_esc_90.14.pt'
67
+ model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda')
68
+ elif clap_model == 'music_speech_epoch_15_esc_89.25.pt':
69
+ url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_epoch_15_esc_89.25.pt'
70
+ clap_path = 'CLAP/music_speech_epoch_15_esc_89.25.pt'
71
+ model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base', device='cuda')
72
+ elif clap_model == '630k-audioset-fusion-best.pt':
73
+ url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/630k-audioset-fusion-best.pt'
74
+ clap_path = 'CLAP/630k-audioset-fusion-best.pt'
75
+ model = laion_clap.CLAP_Module(enable_fusion=True, device='cuda')
76
+ else:
77
+ raise ValueError('clap_model not implemented')
78
+
79
+ # download clap_model if not already downloaded
80
+ if not os.path.exists(clap_path):
81
+ print('Downloading ', clap_model, '...')
82
+ os.makedirs(os.path.dirname(clap_path), exist_ok=True)
83
+
84
+ response = requests.get(url, stream=True)
85
+ total_size = int(response.headers.get('content-length', 0))
86
+
87
+ with open(clap_path, 'wb') as file:
88
+ with tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
89
+ for data in response.iter_content(chunk_size=8192):
90
+ file.write(data)
91
+ progress_bar.update(len(data))
92
+
93
+ # fixing LAION-CLAP issue, see: https://github.com/LAION-AI/CLAP/issues/118
94
+ pkg = load_state_dict(clap_path)
95
+ pkg.pop('text_branch.embeddings.position_ids', None)
96
+ model.model.load_state_dict(pkg)
97
+ model.eval()
98
+
99
+ if not os.path.isdir(audio_path):
100
+ raise ValueError(f'audio_path: {audio_path} does not exist')
101
+
102
+ if id2text:
103
+ print('[EXTRACTING TEXT EMBEDDINGS] ')
104
+ batch_size = 64
105
+ text_emb = {}
106
+ for i in tqdm(range(0, len(id2text), batch_size)):
107
+ batch_ids = list(id2text.keys())[i:i+batch_size]
108
+ batch_texts = [id2text[id] for id in batch_ids]
109
+ with torch.no_grad():
110
+ embeddings = model.get_text_embedding(batch_texts, use_tensor=True)
111
+ for id, emb in zip(batch_ids, embeddings):
112
+ text_emb[id] = emb
113
+
114
+ else:
115
+ raise ValueError('Must specify id2text')
116
+
117
+ print('[EVALUATING GENERATIONS] ', audio_path)
118
+ score = 0
119
+ count = 0
120
+ for id in tqdm(id2text.keys()):
121
+ file_path = os.path.join(audio_path, str(id)+audio_files_extension)
122
+ if os.path.isfile(file_path):
123
+ with torch.no_grad():
124
+ audio, _ = librosa.load(file_path, sr=48000, mono=True) # sample rate should be 48000
125
+ audio = pyln.normalize.peak(audio, -1.0)
126
+ audio = audio.reshape(1, -1) # unsqueeze (1,T)
127
+ audio = torch.from_numpy(int16_to_float32(float32_to_int16(audio))).float()
128
+ audio_embeddings = model.get_audio_embedding_from_data(x = audio, use_tensor=True)
129
+ cosine_sim = torch.nn.functional.cosine_similarity(audio_embeddings, text_emb[id].unsqueeze(0), dim=1, eps=1e-8)[0]
130
+ print(f"{id} | CLAP score = {cosine_sim}")
131
+ score += cosine_sim
132
+ count += 1
133
+
134
+ return score / count if count > 0 else 0
135
+
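A hypothetical usage sketch for `clap_score()`; the ids, prompts and the `generations/` directory are placeholders for illustration only:

# map generated-file ids to the prompts that produced them
id2text = {
    "0001": "an upbeat electronic track with a driving bassline",
    "0002": "a calm solo piano piece in a minor key",
}
score = clap_score(id2text, "generations/", audio_files_extension=".wav",
                   clap_model="music_audioset_epoch_15_esc_90.14.pt")
print("mean LAION-CLAP score:", float(score))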
inspiremusic/metrics/openl3_fd.py ADDED
@@ -0,0 +1,338 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import openl3
15
+ import librosa
16
+ import numpy as np
17
+ from scipy import linalg
18
+ import glob
19
+ from tqdm import tqdm
20
+ import os
21
+ import soxr
22
+ import pyloudnorm as pyln
23
+
24
+
25
+ def calculate_embd_statistics(embd_lst):
26
+ if isinstance(embd_lst, list):
27
+ embd_lst = np.array(embd_lst)
28
+ mu = np.mean(embd_lst, axis=0)
29
+ sigma = np.cov(embd_lst, rowvar=False)
30
+ return mu, sigma
31
+
32
+
33
+ def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
34
+ """
35
+ Adapted from: https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py
36
+ Adapted from: https://github.com/gudgud96/frechet-audio-distance/blob/main/frechet_audio_distance/fad.py
37
+
38
+ Numpy implementation of the Frechet Distance.
39
+
40
+ The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
41
+ and X_2 ~ N(mu_2, C_2) is
42
+ d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
43
+
44
+ Params:
45
+ -- mu1: Embedding's mean statistics for generated samples.
46
+ -- mu2: Embedding's mean statistics for reference samples.
47
+ -- sigma1: Covariance matrix over embeddings for generated samples.
48
+ -- sigma2: Covariance matrix over embeddings for reference samples.
49
+ Returns:
50
+ -- Fréchet Distance.
51
+ """
52
+
53
+ mu1 = np.atleast_1d(mu1)
54
+ mu2 = np.atleast_1d(mu2)
55
+
56
+ sigma1 = np.atleast_2d(sigma1)
57
+ sigma2 = np.atleast_2d(sigma2)
58
+
59
+ assert mu1.shape == mu2.shape, \
60
+ 'Training and test mean vectors have different lengths'
61
+ assert sigma1.shape == sigma2.shape, \
62
+ 'Training and test covariances have different dimensions'
63
+
64
+ diff = mu1 - mu2
65
+
66
+ # product might be almost singular
67
+ covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
68
+ if not np.isfinite(covmean).all():
69
+ msg = ('fid calculation produces singular product; '
70
+ 'adding %s to diagonal of cov estimates') % eps
71
+ print(msg)
72
+ offset = np.eye(sigma1.shape[0]) * eps
73
+ covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
74
+
75
+ # numerical error might give slight imaginary component
76
+ if np.iscomplexobj(covmean):
77
+ if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
78
+ m = np.max(np.abs(covmean.imag))
79
+ raise ValueError('Imaginary component {}'.format(m))
80
+ covmean = covmean.real
81
+
82
+ tr_covmean = np.trace(covmean)
83
+
84
+ return (diff.dot(diff) + np.trace(sigma1)
85
+ + np.trace(sigma2) - 2 * tr_covmean)
86
+
87
+
88
+ def extract_embeddings(directory_path, channels, samplingrate, content_type, openl3_hop_size, batch_size=16):
89
+ """
90
+ Given a list of files, compute their embeddings in batches.
91
+
92
+ If channels == 1: stereo audio is downmixed to mono. Mono embeddings are of dim=512.
93
+
94
+ If channels == 2: mono audio is "faked" to stereo by copying the mono channel.
95
+ Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
96
+
97
+ Params:
98
+ -- directory_path: path where the generated audio files are available.
99
+ -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
100
+ -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
101
+ -- content_type: 'music' or 'env' to select a content type specific openl3 model.
102
+ -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
103
+ -- batch_size: number of audio files to process in each batch.
104
+ Returns:
105
+ -- list of embeddings: [np.array[], ...], as expected by calculate_frechet_distance()
106
+ """
107
+ _, extension = os.path.splitext(directory_path)
108
+ if extension.lower() == ".scp":
109
+ wav_files = []
110
+ with open(directory_path, "r") as f:
111
+ for line in f:
112
+ sec = line.strip().split(" ")
113
+ wav_files.append(sec[1])
114
+ else:
115
+ wav_files = glob.glob(directory_path)
116
+ if len(wav_files) == 0:
117
+ raise ValueError('No files with this extension in this path!')
118
+ model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
119
+
120
+ first = True
121
+ for i in tqdm(range(0, len(wav_files), batch_size)):
122
+ batch_files = wav_files[i:i+batch_size]
123
+ batch_audio_l = []
124
+ batch_audio_r = []
125
+ batch_sr = []
126
+
127
+ for file in batch_files:
128
+ audio, sr = librosa.load(file, sr=None, mono=False)
129
+ audio = audio.T
130
+ audio = pyln.normalize.peak(audio, -1.0)
131
+ if audio.shape[0] < sr:
132
+ print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
133
+
134
+ # resample to the desired evaluation bandwidth
135
+ audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
136
+
137
+ # mono embeddings are stored in batch_audio_l (R channel not used)
138
+ if channels == 1:
139
+ batch_audio_l.append(audio)
140
+
141
+ elif channels == 2:
142
+ if audio.ndim == 1:
143
+ # if mono, "fake" stereo by copying mono channel to L and R
144
+ batch_audio_l.append(audio)
145
+ batch_audio_r.append(audio)
146
+ elif audio.ndim == 2:
147
+ # if it's stereo separate channels for openl3
148
+ batch_audio_l.append(audio[:,0])
149
+ batch_audio_r.append(audio[:,1])
150
+
151
+ batch_sr.append(samplingrate)
152
+
153
+ # extracting mono embeddings (dim=512) or the L channel for stereo embeddings
154
+ emb, _ = openl3.get_audio_embedding(batch_audio_l, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
155
+
156
+ # format mono embedding
157
+ if channels == 1:
158
+ emb = np.concatenate(emb,axis=0)
159
+
160
+ # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
161
+ elif channels == 2:
162
+ # extract the missing R channel
163
+ emb_r, _ = openl3.get_audio_embedding(batch_audio_r, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
164
+ emb = [np.concatenate([l, r], axis=1) for l, r in zip(emb, emb_r)]
165
+ emb = np.concatenate(emb, axis=0)
166
+
167
+ # concatenate embeddings
168
+ if first:
169
+ embeddings = emb
170
+ first = False
171
+ else:
172
+ embeddings = np.concatenate([embeddings, emb], axis=0)
173
+
174
+ # return as a list of embeddings: [np.array[], ...]
175
+ return [e for e in embeddings]
176
+
177
+
178
+ def extract_embeddings_nobatching(directory_path, channels, samplingrate, content_type, openl3_hop_size):
179
+ """
180
+ Given a list of files, compute their embeddings one by one.
181
+
182
+ If channels == 1: stereo audio is downmixed to mono. Mono embeddings are of dim=512.
183
+
184
+ If channels == 2: mono audio is "faked" to stereo by copying the mono channel.
185
+ Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
186
+
187
+ Params:
188
+ -- directory_path: path where the generated audio files are available.
189
+ -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
190
+ -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
191
+ -- content_type: 'music' or 'env' to select a content type specific openl3 model.
192
+ -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
193
+ Returns:
194
+ -- list of embeddings: [np.array[], ...], as expected by calculate_frechet_distance()
195
+ """
196
+ _, extension = os.path.splitext(directory_path)
197
+ if extension.lower() == ".scp":
198
+ wav_files = []
199
+ with open(directory_path, "r") as f:
200
+ for line in f:
201
+ sec = line.strip().split(" ")
202
+ wav_files.append(sec[1])
203
+ else:
204
+ wav_files = glob.glob(directory_path)
205
+ if len(wav_files) == 0:
206
+ raise ValueError('No files with this extension in this path!')
207
+ model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
208
+
209
+ first = True
210
+ for file in tqdm(wav_files):
211
+ audio, sr = librosa.load(file, sr=None)
212
+ audio = pyln.normalize.peak(audio, -1.0)
213
+ if audio.shape[0] < sr:
214
+ print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
215
+
216
+ # resample to the desired evaluation bandwidth
217
+ audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
218
+
219
+ # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
220
+ if channels == 2:
221
+ if audio.ndim == 1:
222
+ audio_l3, sr_l3 = audio, samplingrate
223
+ elif audio.ndim == 2:
224
+ # if it's stereo separate channels for openl3
225
+ audio_l3 = [audio[:,0], audio[:,1]]
226
+ sr_l3 = [samplingrate, samplingrate]
227
+ emb, _ = openl3.get_audio_embedding(audio_l3, sr_l3, model=model, verbose=False, hop_size=openl3_hop_size)
228
+ if audio.ndim == 1:
229
+ # if mono audio, "fake" stereo by concatenating mono embedding as L and R embeddings
230
+ emb = np.concatenate([emb, emb],axis=1)
231
+ elif audio.ndim == 2:
232
+ emb = np.concatenate(emb,axis=1)
233
+
234
+ # or extracting mono embeddings (dim=512)
235
+ elif channels == 1:
236
+ emb, _ = openl3.get_audio_embedding(audio, samplingrate, model=model, verbose=False, hop_size=openl3_hop_size)
237
+
238
+ # concatenate embeddings
239
+ if first:
240
+ embeddings = emb
241
+ first = False
242
+ else:
243
+ embeddings = np.concatenate([embeddings, emb], axis=0)
244
+
245
+ # return as a list of embeddings: [np.array[], ...]
246
+ return [e for e in embeddings]
247
+
248
+
249
+ def openl3_fd(channels, samplingrate, content_type, openl3_hop_size, eval_path,
250
+ eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_embeddings=None, batching=False):
251
+ """
252
+ Compute the Fréchet Distance between files in eval_path and ref_path.
253
+
254
+ Fréchet distance computed on top of openl3 embeddings.
255
+
256
+ GPU-based computation.
257
+
258
+ Extracting the embeddings is timeconsuming. After being computed once, we store them.
259
+ We store pre-computed reference embedding statistics in load/openl3_fd/
260
+ To load those and save computation, just set the path in load_ref_embeddings.
261
+ If load_ref_embeddings is set, ref_path is not required.
262
+
263
+ Params:
264
+ -- channels: 1 (mono), or 2 (stereo) to get the Fréchet Distance over mono or stereo embeddings.
265
+ -- samplingrate: max bandwith at wich we evaluate the given signals. Up to 48kHz.
266
+ -- content_type: 'music' or 'env' to select a content type for openl3.
267
+ -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
268
+ -- eval_path: path where the generated audio files to evaluate are available.
269
+ -- eval_files_extenstion: files extension (default .wav) in eval_path.
270
+ -- ref_path: path where the reference audio files are available. (instead of load_ref_embeddings)
271
+ -- ref_files_extension: files extension (default .wav) in ref_path.
272
+ -- load_ref_embeddings: path to the reference embedding statistics. (inestead of ref_path)
273
+ -- batching: set batch size (with an int) or set to False (default False).
274
+ Returns:
275
+ -- Fréchet distance.
276
+ """
277
+
278
+ if not os.path.isdir(eval_path):
279
+ raise ValueError('eval_path does not exist')
280
+
281
+ if load_ref_embeddings:
282
+ if not os.path.exists(load_ref_embeddings):
283
+ raise ValueError('load_ref_embeddings does not exist')
284
+ print('[LOADING REFERENCE EMBEDDINGS] ', load_ref_embeddings)
285
+ loaded = np.load(load_ref_embeddings)
286
+ mu_ref = loaded['mu_ref']
287
+ sigma_ref = loaded['sigma_ref']
288
+
289
+ else:
290
+ if ref_path:
291
+ if not os.path.isdir(ref_path):
292
+ if not os.path.isfile(ref_path):
293
+ raise ValueError("ref_path does not exist")
294
+ if os.path.isfile(ref_path):
295
+ path = ref_path
296
+ else:
297
+ path = os.path.join(ref_path, '*'+ref_files_extension)
298
+ print('[EXTRACTING REFERENCE EMBEDDINGS] ', path)
299
+ if batching:
300
+ ref_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
301
+ else:
302
+ ref_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)
303
+ mu_ref, sigma_ref = calculate_embd_statistics(ref_embeddings)
304
+
305
+ # store statistics to load later on
306
+ if not os.path.exists('load/openl3_fd'):
307
+ os.makedirs('load/openl3_fd/')
308
+ save_ref_embeddings_path = (
309
+ 'load/openl3_fd/' +
310
+ path.replace('/', '_') +
311
+ '__channels' + str(channels) +
312
+ '__' + str(samplingrate) +
313
+ '__openl3' + str(content_type) +
314
+ '__openl3hopsize' + str(openl3_hop_size) +
315
+ '__batch' + str(batching) +
316
+ '.npz'
317
+ )
318
+ np.savez(save_ref_embeddings_path, mu_ref=mu_ref, sigma_ref=sigma_ref)
319
+ print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_embeddings_path)
320
+
321
+ else:
322
+ raise ValueError('Must specify ref_path or load_ref_embeddings')
323
+
324
+ path = os.path.join(eval_path, '*'+eval_files_extension)
325
+ print('[EXTRACTING EVALUATION EMBEDDINGS] ', path)
326
+ if batching:
327
+ eval_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
328
+ else:
329
+ eval_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)
330
+ mu_eval, sigma_eval = calculate_embd_statistics(eval_embeddings)
331
+
332
+ fd = calculate_frechet_distance(mu_eval, sigma_eval, mu_ref, sigma_ref)
333
+ if load_ref_embeddings:
334
+ print('[FRéCHET DISTANCE] ', eval_path, load_ref_embeddings, fd)
335
+ else:
336
+ print('[FRéCHET DISTANCE] ', eval_path, ref_path, fd)
337
+
338
+ return fd
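A hypothetical call to `openl3_fd()`; the directory paths and parameter values below are placeholders for illustration only:

fd = openl3_fd(
    channels=1,              # mono embeddings (dim=512)
    samplingrate=48000,      # evaluation bandwidth
    content_type='music',
    openl3_hop_size=0.5,
    eval_path='generations/',
    ref_path='references/',
    batching=16,             # or False to disable batching
)
print('openl3 FD:', fd)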
inspiremusic/metrics/passt_kld.py ADDED
@@ -0,0 +1,232 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import warnings
15
+ warnings.filterwarnings("ignore", category=UserWarning)
16
+ warnings.filterwarnings("ignore", category=FutureWarning)
17
+
18
+ import os
19
+ import contextlib
20
+ from functools import partial
21
+ from tqdm import tqdm
22
+ import pickle
23
+ import numpy as np
24
+ import librosa
25
+ from hear21passt.base import get_basic_model
26
+ import pyloudnorm as pyln
27
+
28
+ import torch
29
+ import torch.nn.functional as F
30
+
31
+
32
+ SAMPLING_RATE = 32000
33
+
34
+
35
+ class _patch_passt_stft:
36
+ """
37
+ From version 1.8.0, return_complex must always be given explicitly
38
+ for real inputs and return_complex=False has been deprecated.
39
+
40
+ Decorator to patch torch.stft in PaSST that uses an old stft version.
41
+
42
+ Adapted from: https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
43
+ """
44
+ def __init__(self):
45
+ self.old_stft = torch.stft
46
+
47
+ def __enter__(self):
48
+ # return_complex is a mandatory parameter in latest torch versions.
49
+ # torch is throwing RuntimeErrors when not set.
50
+ # see: https://pytorch.org/docs/1.7.1/generated/torch.stft.html?highlight=stft#torch.stft
51
+ # see: https://github.com/kkoutini/passt_hear21/commit/dce83183674e559162b49924d666c0a916dc967a
52
+ torch.stft = partial(torch.stft, return_complex=False)
53
+
54
+ def __exit__(self, *exc):
55
+ torch.stft = self.old_stft
56
+
57
+
58
+ def return_probabilities(model, audio_path, window_size=10, overlap=5, collect='mean'):
59
+ """
60
+ Given an audio and the PaSST model, return the probabilities of each AudioSet class.
61
+
62
+ Audio is converted to mono at 32kHz.
63
+
64
+ PaSST model is trained with 10 sec inputs. We refer to this parameter as the window_size.
65
+ We set it to 10 sec for consistency with PaSST training.
66
+
67
+ For longer audio, we split the signal into overlapping analysis windows of window_size (10 sec) with an overlap of 5 sec.
68
+ PaSST supports 10, 20 or 30 sec inputs, but not longer ones: https://github.com/kkoutini/PaSST/issues/19
69
+
70
+ Note that AudioSet taggers normally use sigmoid output layers. Yet, to compute the
71
+ KL we work with normalized probabilities by running a softmax over logits as in MusicGen:
72
+ https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
73
+
74
+ This implementation assumes run will be on GPU.
75
+
76
+ Params:
77
+ -- model: PaSST model on a GPU.
78
+ -- audio_path: path to the audio to be loaded with librosa.
79
+ -- window_size (default=10 sec): analysis window (and receptive field) of PaSST.
80
+ -- overlap (default=5 sec): overlap of the running analysis window for inputs longer than window_size (10 sec).
81
+ -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along logits vector.
82
+ Returns:
83
+ -- 527 probabilities (after softmax, no logarithm).
84
+ """
85
+ # load the audio using librosa
86
+ audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
87
+ audio = pyln.normalize.peak(audio, -1.0)
88
+
89
+ # calculate the step size for the analysis windows with the specified overlap
90
+ step_size = int((window_size - overlap) * SAMPLING_RATE)
91
+
92
+ # iterate over the audio, creating analysis windows
93
+ probabilities = []
94
+ for i in range(0, max(step_size, len(audio) - step_size), step_size):
95
+ # extract the current analysis window
96
+ window = audio[i:i + int(window_size * SAMPLING_RATE)]
97
+
98
+ # pad the window with zeros if it's shorter than the desired window size
99
+ if len(window) < int(window_size * SAMPLING_RATE):
100
+ # discard window if it's too small (avoid mostly zeros predicted as silence), as in MusicGen:
101
+ # https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
102
+ if len(window) > int(window_size * SAMPLING_RATE * 0.15):
103
+ tmp = np.zeros(int(window_size * SAMPLING_RATE))
104
+ tmp[:len(window)] = window
105
+ window = tmp
106
+
107
+ # convert to a PyTorch tensor and move to GPU
108
+ audio_wave = torch.from_numpy(window.astype(np.float32)).unsqueeze(0).cuda()
109
+
110
+ # get the probabilities for this analysis window
111
+ with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f):
112
+ with torch.no_grad(), _patch_passt_stft():
113
+ logits = model(audio_wave)
114
+ probabilities.append(torch.squeeze(logits))
115
+
116
+ probabilities = torch.stack(probabilities)
117
+ if collect == 'mean':
118
+ probabilities = torch.mean(probabilities, dim=0)
119
+ elif collect == 'max':
120
+ probabilities, _ = torch.max(probabilities, dim=0)
121
+
122
+ return F.softmax(probabilities, dim=0).squeeze().cpu()
123
+
124
+
125
+ def passt_kld(ids, eval_path, eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_probabilities=None, no_ids=[], collect='mean'):
126
+ """
127
+ Compute KL-divergence between the label probabilities of the generated audio with respect to the original audio.
128
+ Both generated audio (in eval_path) and original audio (in ref_path) are represented by the same prompt/description.
129
+ Audio files are identified by an id that is the filename in both directories and links the audio with the prompt/description.
131
+
132
+ For inputs longer than the 10 sec PaSST was trained on, we aggregate/collect via 'mean' (default) or 'max' pooling along the logits vector.
133
+ We split the input into overlapping analysis windows. Subsequently, we aggregate/collect (across windows) the generated logits and then apply a softmax.
134
+
135
+ This evaluation script assumes that ids are in both ref_path and eval_path.
136
+
137
+ We label probabilities via the PaSST model: https://github.com/kkoutini/PaSST
138
+
139
+ GPU-based computation.
140
+
141
+ Extracting the probabilities is time-consuming. After being computed once, we store them.
142
+ We store pre-computed reference probabilities in load/
143
+ To load those and save computation, just set the path in load_ref_probabilities.
144
+ If load_ref_probabilities is set, ref_path is not required.
145
+
146
+ Params:
147
+ -- ids: list of ids present in both eval_path and ref_path.
148
+ -- eval_path: path where the generated audio files to evaluate are available.
149
+ -- eval_files_extension: files extension (default .wav) in eval_path.
150
+ -- ref_path: path where the reference audio files are available. (instead of load_ref_probabilities)
151
+ -- ref_files_extension: files extension (default .wav) in ref_path.
152
+ -- load_ref_probabilities: path to the reference probabilities. (instead of ref_path)
153
+ -- no_ids: it is possible that some reference audio is corrupted or not present. Ignore the ids in this list.
154
+ -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along the logits vector.
155
+ Returns:
156
+ -- KL divergence
157
+ """
158
+ with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f): # capturing all useless outputs from passt
159
+ # load model
160
+ model = get_basic_model(mode="logits")
161
+ model.eval()
162
+ model = model.cuda()
163
+
164
+ if not os.path.isdir(eval_path):
165
+ if not os.path.isfile(eval_path):
166
+ raise ValueError('eval_path does not exist')
167
+
168
+ if load_ref_probabilities:
169
+ if not os.path.exists(load_ref_probabilities):
170
+ raise ValueError('load_ref_probabilities does not exist')
171
+ print('[LOADING REFERENCE PROBABILITIES] ', load_ref_probabilities)
172
+ with open(load_ref_probabilities, 'rb') as fp:
173
+ ref_p = pickle.load(fp)
174
+
175
+ else:
176
+ if ref_path:
177
+ if not os.path.isdir(ref_path):
178
+ if os.path.isfile(ref_path):
179
+ id2utt = {}
180
+ with open(ref_path, "r") as f:
181
+ for line in f:
182
+ sec = line.strip().split(" ")
183
+ id2utt[sec[0]] = sec[1]
184
+ f.close()
185
+ else:
186
+ raise ValueError("ref_path does not exist")
187
+ print('[EXTRACTING REFERENCE PROBABILITIES] ', ref_path)
188
+ ref_p = {}
189
+ for id in tqdm(ids):
190
+ if id not in no_ids:
191
+ try:
192
+ if os.path.isfile(ref_path):
193
+ if id in id2utt.keys():
194
+ audio_path = id2utt[id]
195
+ else:
196
+ raise ValueError(f"id: {id} not in {ref_path}!")
197
+ else:
198
+ audio_path = os.path.join(ref_path, str(id)+ref_files_extension)
199
+ if os.path.isfile(audio_path):
200
+ ref_p[id] = return_probabilities(model, audio_path, collect=collect)
201
+ except Exception as e:
202
+ print(f"An unexpected error occurred with {id}: {e}\nIf you failed to download it you can add it to no_ids list.")
203
+
204
+ # store reference probabilities to load later on
205
+ if not os.path.exists('load/passt_kld/'):
206
+ os.makedirs('load/passt_kld/')
207
+ save_ref_probabilities_path = 'load/passt_kld/'+ref_path.replace('/', '_')+'_collect'+str(collect)+'__reference_probabilities.pkl'
208
+ with open(save_ref_probabilities_path, 'wb') as fp:
209
+ pickle.dump(ref_p, fp)
210
+ print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_probabilities_path)
211
+
212
+ else:
213
+ raise ValueError('Must specify ref_path or load_ref_probabilities')
214
+
215
+ print('[EVALUATING GENERATIONS] ', eval_path)
216
+
217
+ passt_kl = 0
218
+ count = 0
219
+ for id in tqdm(ids):
220
+ if id not in no_ids:
221
+ try:
222
+ audio_path = os.path.join(eval_path, str(id)+eval_files_extension)
223
+ if os.path.isfile(audio_path):
224
+ eval_p = return_probabilities(model, audio_path, collect=collect)
225
+ # note: F.kl_div(x, y) is KL(y||x)
226
+ # see: https://github.com/pytorch/pytorch/issues/7337
227
+ # see: https://discuss.pytorch.org/t/kl-divergence-different-results-from-tf/56903/2
228
+ passt_kl += F.kl_div((ref_p[id] + 1e-6).log(), eval_p, reduction='sum', log_target=False)
229
+ count += 1
230
+ except Exception as e:
231
+ print(f"An unexpected error occurred with {id}: {e}\nIf you failed to download it you can add it to no_ids list.")
232
+ return passt_kl / count if count > 0 else 0
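A hypothetical call to `passt_kld()`; the ids and directory paths below are placeholders for illustration only:

ids = ["0001", "0002", "0003"]  # filenames shared by eval_path and ref_path
kld = passt_kld(ids,
                eval_path='generations/',
                ref_path='references/',
                collect='mean')
print('PaSST KL divergence:', float(kld))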
inspiremusic/music_tokenizer/__init__.py ADDED
File without changes
inspiremusic/music_tokenizer/env.py ADDED
@@ -0,0 +1,29 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import shutil
17
+
18
+
19
+ class AttrDict(dict):
20
+ def __init__(self, *args, **kwargs):
21
+ super(AttrDict, self).__init__(*args, **kwargs)
22
+ self.__dict__ = self
23
+
24
+
25
+ def build_env(config, config_name, path):
26
+ t_path = os.path.join(path, config_name)
27
+ if config != t_path:
28
+ os.makedirs(path, exist_ok=True)
29
+ shutil.copyfile(config, os.path.join(path, config_name))
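For illustration, `AttrDict` simply mirrors dictionary keys as attributes; the values below are made up:

h = AttrDict({"n_fft": 1024, "hop_size": 256})
print(h.n_fft, h["hop_size"])  # 1024 256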
inspiremusic/music_tokenizer/meldataset.py ADDED
@@ -0,0 +1,226 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # code based on https://github.com/b04901014/MQTTS
16
+ import math
17
+ import os
18
+ import random
19
+
20
+ import librosa
21
+ import numpy as np
22
+ import torch.utils.data
23
+ from librosa.filters import mel as librosa_mel_fn
24
+
25
+ def load_wav(full_path, sr):
26
+ wav, sr = librosa.load(full_path, sr=sr)
27
+ return wav, sr
28
+
29
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
30
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
31
+
32
+ def dynamic_range_decompression(x, C=1):
33
+ return np.exp(x) / C
34
+
35
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
36
+ return torch.log(torch.clamp(x, min=clip_val) * C)
37
+
38
+ def dynamic_range_decompression_torch(x, C=1):
39
+ return torch.exp(x) / C
40
+
41
+ def spectral_normalize_torch(magnitudes):
42
+ output = dynamic_range_compression_torch(magnitudes)
43
+ return output
44
+
45
+ def spectral_de_normalize_torch(magnitudes):
46
+ output = dynamic_range_decompression_torch(magnitudes)
47
+ return output
48
+
49
+ mel_basis = {}
50
+ hann_window = {}
51
+
52
+ ## modified to get stft with return complex value = True for pytorch ver2.0
53
+ def mel_spectrogram(y,
54
+ n_fft,
55
+ num_mels,
56
+ sampling_rate,
57
+ hop_size,
58
+ win_size,
59
+ fmin,
60
+ fmax,
61
+ center=False):
62
+
63
+ global mel_basis, hann_window
64
+ if fmax not in mel_basis:
65
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
66
+ mel_basis[str(fmax) + '_' +
67
+ str(y.device)] = torch.from_numpy(mel).float().to(y.device)
68
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
69
+
70
+ y = torch.nn.functional.pad(
71
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int(
72
+ (n_fft - hop_size) / 2)),
73
+ mode='reflect')
74
+ y = y.squeeze(1)
75
+
76
+ spec = torch.view_as_real(torch.stft(
77
+ y,
78
+ n_fft,
79
+ hop_length=hop_size,
80
+ win_length=win_size,
81
+ window=hann_window[str(y.device)],
82
+ center=center,
83
+ pad_mode='reflect',
84
+ normalized=False,
85
+ onesided=True,
86
+ return_complex=True
87
+ ))
88
+
89
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
90
+
91
+ spec = torch.matmul(mel_basis[str(fmax) + '_' + str(y.device)], spec)
92
+ spec = spectral_normalize_torch(spec)
93
+
94
+ return spec
95
+
96
+
97
+ def get_dataset_filelist(a):
98
+ with open(a.input_training_file, 'r') as f:
99
+ training_files = [l.strip() for l in f]
100
+ with open(a.input_validation_file, 'r') as f:
101
+ validation_files = [l.strip() for l in f]
102
+ return training_files, validation_files
103
+
104
+
105
+ class MelDataset(torch.utils.data.Dataset):
106
+ def __init__(self,
107
+ training_files,
108
+ segment_size,
109
+ n_fft,
110
+ num_mels,
111
+ hop_size,
112
+ win_size,
113
+ sampling_rate,
114
+ fmin,
115
+ fmax,
116
+ split=True,
117
+ shuffle=True,
118
+ n_cache_reuse=1,
119
+ device=None,
120
+ fmax_loss=None,
121
+ fine_tuning=False,
122
+ base_mels_path=None):
123
+ self.audio_files = training_files
124
+ random.seed(1234)
125
+ if shuffle:
126
+ random.shuffle(self.audio_files)
127
+ self.segment_size = segment_size
128
+ self.sampling_rate = sampling_rate
129
+ self.split = split
130
+ self.n_fft = n_fft
131
+ self.num_mels = num_mels
132
+ self.hop_size = hop_size
133
+ self.win_size = win_size
134
+ self.fmin = fmin
135
+ self.fmax = fmax
136
+ self.fmax_loss = fmax_loss
137
+ self.cached_wav = None
138
+ self.n_cache_reuse = n_cache_reuse
139
+ self._cache_ref_count = 0
140
+ self.device = device
141
+ self.fine_tuning = fine_tuning
142
+ self.base_mels_path = base_mels_path
143
+
144
+ def __getitem__(self, index):
145
+ filename = self.audio_files[index]
146
+ if self._cache_ref_count == 0:
147
+ try:
148
+ # Note by yuantian: load with the sample_rate of config
149
+ audio, sampling_rate = load_wav(filename, sr=self.sampling_rate)
150
+ except Exception as e:
151
+ print(f"Error on audio: {filename}")
152
+ audio = np.random.normal(size=(160000, )) * 0.05
153
+ sampling_rate = self.sampling_rate
154
+ self.cached_wav = audio
155
+ if sampling_rate != self.sampling_rate:
156
+ raise ValueError("{} SR doesn't match target {} SR".format(
157
+ sampling_rate, self.sampling_rate))
158
+ self._cache_ref_count = self.n_cache_reuse
159
+ else:
160
+ audio = self.cached_wav
161
+ self._cache_ref_count -= 1
162
+
163
+ audio = torch.FloatTensor(audio)
164
+ audio = audio.unsqueeze(0)
165
+
166
+ if not self.fine_tuning:
167
+ if self.split:
168
+ if audio.size(1) >= self.segment_size:
169
+ max_audio_start = audio.size(1) - self.segment_size
170
+ audio_start = random.randint(0, max_audio_start)
171
+ audio = audio[:, audio_start:audio_start +
172
+ self.segment_size]
173
+ else:
174
+ audio = torch.nn.functional.pad(audio, (
175
+ 0, self.segment_size - audio.size(1)), 'constant')
176
+
177
+ mel = mel_spectrogram(
178
+ audio,
179
+ self.n_fft,
180
+ self.num_mels,
181
+ self.sampling_rate,
182
+ self.hop_size,
183
+ self.win_size,
184
+ self.fmin,
185
+ self.fmax,
186
+ center=False)
187
+ else:
188
+ mel = np.load(
189
+ os.path.join(self.base_mels_path,
190
+ os.path.splitext(os.path.split(filename)[-1])[0] +
191
+ '.npy'))
192
+ mel = torch.from_numpy(mel)
193
+
194
+ if len(mel.shape) < 3:
195
+ mel = mel.unsqueeze(0)
196
+
197
+ if self.split:
198
+ frames_per_seg = math.ceil(self.segment_size / self.hop_size)
199
+
200
+ if audio.size(1) >= self.segment_size:
201
+ mel_start = random.randint(0,
202
+ mel.size(2) - frames_per_seg - 1)
203
+ mel = mel[:, :, mel_start:mel_start + frames_per_seg]
204
+ audio = audio[:, mel_start * self.hop_size:(
205
+ mel_start + frames_per_seg) * self.hop_size]
206
+ else:
207
+ mel = torch.nn.functional.pad(mel, (
208
+ 0, frames_per_seg - mel.size(2)), 'constant')
209
+ audio = torch.nn.functional.pad(audio, (
210
+ 0, self.segment_size - audio.size(1)), 'constant')
211
+
212
+ mel_loss = mel_spectrogram(
213
+ audio,
214
+ self.n_fft,
215
+ self.num_mels,
216
+ self.sampling_rate,
217
+ self.hop_size,
218
+ self.win_size,
219
+ self.fmin,
220
+ self.fmax_loss,
221
+ center=False)
222
+
223
+ return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
224
+
225
+ def __len__(self):
226
+ return len(self.audio_files)
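A quick sketch (not part of the commit) of mel_spectrogram on random audio; the sample rate, FFT, hop, and mel settings below are illustrative assumptions rather than the project's configured values.

import torch
from inspiremusic.music_tokenizer.meldataset import mel_spectrogram

y = torch.randn(1, 24000)  # (batch, samples): one second of dummy 24 kHz audio
mel = mel_spectrogram(y, n_fft=1024, num_mels=80, sampling_rate=24000,
                      hop_size=240, win_size=1024, fmin=0, fmax=8000)
print(mel.shape)  # (1, 80, 100): log-compressed mel frames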
inspiremusic/music_tokenizer/models.py ADDED
@@ -0,0 +1,548 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ from torch.nn import AvgPool1d
19
+ from torch.nn import Conv1d
20
+ from torch.nn import Conv2d
21
+ from torch.nn import ConvTranspose1d
22
+ from torch.nn.utils import remove_weight_norm
23
+ from torch.nn.utils import spectral_norm
24
+ from torch.nn.utils import weight_norm
25
+
26
+ from inspiremusic.utils.tokenizer_utils import get_padding
27
+ from inspiremusic.utils.tokenizer_utils import init_weights
28
+
29
+ LRELU_SLOPE = 0.1
30
+
31
+ class ResBlock1(torch.nn.Module):
32
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
33
+ super(ResBlock1, self).__init__()
34
+ self.h = h
35
+ self.convs1 = nn.ModuleList([
36
+ weight_norm(
37
+ Conv1d(
38
+ channels,
39
+ channels,
40
+ kernel_size,
41
+ 1,
42
+ dilation=dilation[0],
43
+ padding=get_padding(kernel_size, dilation[0]))),
44
+ weight_norm(
45
+ Conv1d(
46
+ channels,
47
+ channels,
48
+ kernel_size,
49
+ 1,
50
+ dilation=dilation[1],
51
+ padding=get_padding(kernel_size, dilation[1]))),
52
+ weight_norm(
53
+ Conv1d(
54
+ channels,
55
+ channels,
56
+ kernel_size,
57
+ 1,
58
+ dilation=dilation[2],
59
+ padding=get_padding(kernel_size, dilation[2])))
60
+ ])
61
+ self.convs1.apply(init_weights)
62
+
63
+ self.convs2 = nn.ModuleList([
64
+ weight_norm(
65
+ Conv1d(
66
+ channels,
67
+ channels,
68
+ kernel_size,
69
+ 1,
70
+ dilation=1,
71
+ padding=get_padding(kernel_size, 1))), weight_norm(
72
+ Conv1d(
73
+ channels,
74
+ channels,
75
+ kernel_size,
76
+ 1,
77
+ dilation=1,
78
+ padding=get_padding(kernel_size, 1))), weight_norm(
79
+ Conv1d(
80
+ channels,
81
+ channels,
82
+ kernel_size,
83
+ 1,
84
+ dilation=1,
85
+ padding=get_padding(kernel_size, 1)))
86
+ ])
87
+ self.convs2.apply(init_weights)
88
+
89
+ def forward(self, x):
90
+ for c1, c2 in zip(self.convs1, self.convs2):
91
+ xt = F.leaky_relu(x, LRELU_SLOPE)
92
+ xt = c1(xt)
93
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
94
+ xt = c2(xt)
95
+ x = xt + x
96
+ return x
97
+
98
+ def remove_weight_norm(self):
99
+ for l in self.convs1:
100
+ remove_weight_norm(l)
101
+ for l in self.convs2:
102
+ remove_weight_norm(l)
103
+
104
+
105
+ class ResBlock2(torch.nn.Module):
106
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
107
+ super(ResBlock2, self).__init__()
108
+ self.h = h
109
+ self.convs = nn.ModuleList([
110
+ weight_norm(
111
+ Conv1d(
112
+ channels,
113
+ channels,
114
+ kernel_size,
115
+ 1,
116
+ dilation=dilation[0],
117
+ padding=get_padding(kernel_size, dilation[0]))),
118
+ weight_norm(
119
+ Conv1d(
120
+ channels,
121
+ channels,
122
+ kernel_size,
123
+ 1,
124
+ dilation=dilation[1],
125
+ padding=get_padding(kernel_size, dilation[1])))
126
+ ])
127
+ self.convs.apply(init_weights)
128
+
129
+ def forward(self, x):
130
+ for c in self.convs:
131
+ xt = F.leaky_relu(x, LRELU_SLOPE)
132
+ xt = c(xt)
133
+ x = xt + x
134
+ return x
135
+
136
+ def remove_weight_norm(self):
137
+ for l in self.convs:
138
+ remove_weight_norm(l)
139
+
140
+
141
+ class Generator(torch.nn.Module):
142
+ def __init__(self, h):
143
+ super(Generator, self).__init__()
144
+ self.h = h
145
+ self.num_kernels = len(h.resblock_kernel_sizes)
146
+ self.num_upsamples = len(h.upsample_rates)
147
+ self.conv_pre = weight_norm(
148
+ Conv1d(512, h.upsample_initial_channel, 7, 1, padding=3))
149
+ resblock = ResBlock1 if h.resblock == '1' else ResBlock2
150
+
151
+ self.ups = nn.ModuleList()
152
+ for i, (u,
153
+ k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
154
+ self.ups.append(
155
+ weight_norm(
156
+ ConvTranspose1d(
157
+ h.upsample_initial_channel // (2**i),
158
+ h.upsample_initial_channel // (2**(i + 1)),
159
+ k,
160
+ u,
161
+ # padding=(u//2 + u%2),
162
+ padding=(k - u) // 2,
163
+ # output_padding=u%2
164
+ )))
165
+
166
+ self.resblocks = nn.ModuleList()
167
+ for i in range(len(self.ups)):
168
+ ch = h.upsample_initial_channel // (2**(i + 1))
169
+ for j, (k, d) in enumerate(
170
+ zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
171
+ self.resblocks.append(resblock(h, ch, k, d))
172
+
173
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
174
+ self.ups.apply(init_weights)
175
+ self.conv_post.apply(init_weights)
176
+
177
+ def forward(self, x):
178
+ x = self.conv_pre(x)
179
+ for i in range(self.num_upsamples):
180
+ x = F.leaky_relu(x, LRELU_SLOPE)
181
+ x = self.ups[i](x)
182
+ xs = None
183
+ for j in range(self.num_kernels):
184
+ if xs is None:
185
+ xs = self.resblocks[i * self.num_kernels + j](x)
186
+ else:
187
+ xs += self.resblocks[i * self.num_kernels + j](x)
188
+ x = xs / self.num_kernels
189
+ x = F.leaky_relu(x, LRELU_SLOPE)
190
+ x = self.conv_post(x)
191
+ x = torch.tanh(x)
192
+
193
+ return x
194
+
195
+ def remove_weight_norm(self):
196
+ print('Removing weight norm...')
197
+ for l in self.ups:
198
+ remove_weight_norm(l)
199
+ for l in self.resblocks:
200
+ l.remove_weight_norm()
201
+ remove_weight_norm(self.conv_pre)
202
+ remove_weight_norm(self.conv_post)
203
+
204
+
205
+ class DiscriminatorP(torch.nn.Module):
206
+ def __init__(self, period, kernel_size=5, stride=3,
207
+ use_spectral_norm=False):
208
+ super(DiscriminatorP, self).__init__()
209
+ self.period = period
210
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
211
+ self.convs = nn.ModuleList([
212
+ norm_f(
213
+ Conv2d(
214
+ 1,
215
+ 32, (kernel_size, 1), (stride, 1),
216
+ padding=(get_padding(5, 1), 0))),
217
+ norm_f(
218
+ Conv2d(
219
+ 32,
220
+ 128, (kernel_size, 1), (stride, 1),
221
+ padding=(get_padding(5, 1), 0))),
222
+ norm_f(
223
+ Conv2d(
224
+ 128,
225
+ 512, (kernel_size, 1), (stride, 1),
226
+ padding=(get_padding(5, 1), 0))),
227
+ norm_f(
228
+ Conv2d(
229
+ 512,
230
+ 1024, (kernel_size, 1), (stride, 1),
231
+ padding=(get_padding(5, 1), 0))),
232
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
233
+ ])
234
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
235
+
236
+ def forward(self, x):
237
+ fmap = []
238
+
239
+ # 1d to 2d
240
+ b, c, t = x.shape
241
+ if t % self.period != 0: # pad first
242
+ n_pad = self.period - (t % self.period)
243
+ x = F.pad(x, (0, n_pad), "reflect")
244
+ t = t + n_pad
245
+ x = x.view(b, c, t // self.period, self.period)
246
+
247
+ for l in self.convs:
248
+ x = l(x)
249
+ x = F.leaky_relu(x, LRELU_SLOPE)
250
+ fmap.append(x)
251
+ x = self.conv_post(x)
252
+ fmap.append(x)
253
+ x = torch.flatten(x, 1, -1)
254
+
255
+ return x, fmap
256
+
257
+
258
+ class MultiPeriodDiscriminator(torch.nn.Module):
259
+ def __init__(self):
260
+ super(MultiPeriodDiscriminator, self).__init__()
261
+ self.discriminators = nn.ModuleList([
262
+ DiscriminatorP(2),
263
+ DiscriminatorP(3),
264
+ DiscriminatorP(5),
265
+ DiscriminatorP(7),
266
+ DiscriminatorP(11),
267
+ ])
268
+
269
+ def forward(self, y, y_hat):
270
+ y_d_rs = []
271
+ y_d_gs = []
272
+ fmap_rs = []
273
+ fmap_gs = []
274
+ for i, d in enumerate(self.discriminators):
275
+ y_d_r, fmap_r = d(y)
276
+ y_d_g, fmap_g = d(y_hat)
277
+ y_d_rs.append(y_d_r)
278
+ fmap_rs.append(fmap_r)
279
+ y_d_gs.append(y_d_g)
280
+ fmap_gs.append(fmap_g)
281
+
282
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
283
+
284
+
285
+ class DiscriminatorS(torch.nn.Module):
286
+ def __init__(self, use_spectral_norm=False):
287
+ super(DiscriminatorS, self).__init__()
288
+ norm_f = weight_norm if use_spectral_norm is False else spectral_norm
289
+ self.convs = nn.ModuleList([
290
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
291
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
292
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
293
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
294
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
295
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
296
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
297
+ ])
298
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
299
+
300
+ def forward(self, x):
301
+ fmap = []
302
+ for l in self.convs:
303
+ x = l(x)
304
+ x = F.leaky_relu(x, LRELU_SLOPE)
305
+ fmap.append(x)
306
+ x = self.conv_post(x)
307
+ fmap.append(x)
308
+ x = torch.flatten(x, 1, -1)
309
+
310
+ return x, fmap
311
+
312
+
313
+ class MultiScaleDiscriminator(torch.nn.Module):
314
+ def __init__(self):
315
+ super(MultiScaleDiscriminator, self).__init__()
316
+ self.discriminators = nn.ModuleList([
317
+ DiscriminatorS(use_spectral_norm=True),
318
+ DiscriminatorS(),
319
+ DiscriminatorS(),
320
+ ])
321
+ self.meanpools = nn.ModuleList(
322
+ [AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)])
323
+
324
+ def forward(self, y, y_hat):
325
+ y_d_rs = []
326
+ y_d_gs = []
327
+ fmap_rs = []
328
+ fmap_gs = []
329
+ for i, d in enumerate(self.discriminators):
330
+ if i != 0:
331
+ y = self.meanpools[i - 1](y)
332
+ y_hat = self.meanpools[i - 1](y_hat)
333
+ y_d_r, fmap_r = d(y)
334
+ y_d_g, fmap_g = d(y_hat)
335
+ y_d_rs.append(y_d_r)
336
+ fmap_rs.append(fmap_r)
337
+ y_d_gs.append(y_d_g)
338
+ fmap_gs.append(fmap_g)
339
+
340
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
341
+
342
+
343
+ def feature_loss(fmap_r, fmap_g):
344
+ loss = 0
345
+ for dr, dg in zip(fmap_r, fmap_g):
346
+ for rl, gl in zip(dr, dg):
347
+ loss += torch.mean(torch.abs(rl - gl))
348
+
349
+ return loss * 2
350
+
351
+
352
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
353
+ loss = 0
354
+ r_losses = []
355
+ g_losses = []
356
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
357
+ r_loss = torch.mean((1 - dr)**2)
358
+ g_loss = torch.mean(dg**2)
359
+ loss += (r_loss + g_loss)
360
+ r_losses.append(r_loss.item())
361
+ g_losses.append(g_loss.item())
362
+
363
+ return loss, r_losses, g_losses
364
+
365
+
366
+ def generator_loss(disc_outputs):
367
+ loss = 0
368
+ gen_losses = []
369
+ for dg in disc_outputs:
370
+ l = torch.mean((1 - dg)**2)
371
+ gen_losses.append(l)
372
+ loss += l
373
+
374
+ return loss, gen_losses
375
+
376
+
377
+ class Encoder(torch.nn.Module):
378
+ def __init__(self, h):
379
+ super(Encoder, self).__init__()
380
+ self.h = h
381
+ self.num_kernels = len(h.resblock_kernel_sizes)
382
+ self.num_upsamples = len(h.upsample_rates)
383
+ self.conv_pre = weight_norm(Conv1d(1, 32, 7, 1, padding=3))
384
+ self.normalize = nn.ModuleList()
385
+ resblock = ResBlock1 if h.resblock == '1' else ResBlock2
386
+
387
+ self.ups = nn.ModuleList()
388
+ for i, (u, k) in enumerate(
389
+ list(
390
+ reversed(
391
+ list(zip(h.upsample_rates, h.upsample_kernel_sizes))))):
392
+ self.ups.append(
393
+ weight_norm(
394
+ Conv1d(
395
+ 32 * (2**i),
396
+ 32 * (2**(i + 1)),
397
+ k,
398
+ u,
399
+ padding=((k - u) // 2)
400
+ # padding=(u//2 + u%2)
401
+ )))
402
+ self.resblocks = nn.ModuleList()
403
+ for i in range(len(self.ups)):
404
+ ch = 32 * (2**(i + 1))
405
+ for j, (k, d) in enumerate(
406
+ zip(
407
+ list(reversed(h.resblock_kernel_sizes)),
408
+ list(reversed(h.resblock_dilation_sizes)))):
409
+ self.resblocks.append(resblock(h, ch, k, d))
410
+ self.normalize.append(
411
+ torch.nn.GroupNorm(ch // 16, ch, eps=1e-6, affine=True))
412
+ self.conv_post = Conv1d(512, 512, 3, 1, padding=1)
413
+ self.ups.apply(init_weights)
414
+ self.conv_post.apply(init_weights)
415
+
416
+ def forward(self, x):
417
+ x = self.conv_pre(x)
418
+ for i in range(self.num_upsamples):
419
+ x = F.leaky_relu(x, LRELU_SLOPE)
420
+ x = self.ups[i](x)
421
+ xs = None
422
+ for j in range(self.num_kernels):
423
+ if xs is None:
424
+ xs = self.resblocks[i * self.num_kernels + j](x)
425
+ xs = self.normalize[i * self.num_kernels + j](xs)
426
+ else:
427
+ xs += self.resblocks[i * self.num_kernels + j](x)
428
+ xs = self.normalize[i * self.num_kernels + j](xs)
429
+ x = xs / self.num_kernels
430
+ x = F.leaky_relu(x)
431
+ x = self.conv_post(x)
432
+ return x
433
+
434
+ def remove_weight_norm(self):
435
+ print('Removing weight norm...')
436
+ for l in self.ups:
437
+ remove_weight_norm(l)
438
+ for l in self.resblocks:
439
+ l.remove_weight_norm()
440
+ remove_weight_norm(self.conv_pre)
441
+
442
+
443
+ class Quantizer_module(torch.nn.Module):
444
+ def __init__(self, n_e, e_dim):
445
+ super(Quantizer_module, self).__init__()
446
+ self.embedding = nn.Embedding(n_e, e_dim)
447
+ self.embedding.weight.data.uniform_(-1.0 / n_e, 1.0 / n_e)
448
+
449
+ def forward(self, x):
450
+ # compute Euclidean distance
451
+ d = torch.sum(x ** 2, 1, keepdim=True) + torch.sum(self.embedding.weight ** 2, 1) \
452
+ - 2 * torch.matmul(x, self.embedding.weight.T)
453
+ min_indicies = torch.argmin(d, 1)
454
+ z_q = self.embedding(min_indicies)
455
+ return z_q, min_indicies
456
+
457
+
458
+ class Quantizer(torch.nn.Module):
459
+ def __init__(self, h):
460
+ super(Quantizer, self).__init__()
461
+ assert 512 % h.n_code_groups == 0
462
+ self.quantizer_modules = nn.ModuleList([
463
+ Quantizer_module(h.n_codes, 512 // h.n_code_groups)
464
+ for _ in range(h.n_code_groups)
465
+ ])
466
+ self.quantizer_modules2 = nn.ModuleList([
467
+ Quantizer_module(h.n_codes, 512 // h.n_code_groups)
468
+ for _ in range(h.n_code_groups)
469
+ ])
470
+ self.h = h
471
+ self.codebook_loss_lambda = self.h.codebook_loss_lambda # e.g., 1
472
+ self.commitment_loss_lambda = self.h.commitment_loss_lambda # e.g., 0.25
473
+ self.residul_layer = 2
474
+ self.n_code_groups = h.n_code_groups
475
+
476
+ def for_one_step(self, xin, idx):
477
+ xin = xin.transpose(1, 2)
478
+ x = xin.reshape(-1, 512)
479
+ x = torch.split(x, 512 // self.h.n_code_groups, dim=-1)
480
+ min_indicies = []
481
+ z_q = []
482
+ if idx == 0:
483
+ for _x, m in zip(x, self.quantizer_modules):
484
+ _z_q, _min_indicies = m(_x)
485
+ z_q.append(_z_q)
486
+ min_indicies.append(_min_indicies) #B * T,
487
+ z_q = torch.cat(z_q, -1).reshape(xin.shape)
488
+ # loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2)
489
+ loss = self.codebook_loss_lambda * torch.mean((z_q - xin.detach()) ** 2) \
490
+ + self.commitment_loss_lambda * torch.mean((z_q.detach() - xin) ** 2)
491
+ z_q = xin + (z_q - xin).detach()
492
+ z_q = z_q.transpose(1, 2)
493
+ return z_q, loss, min_indicies
494
+ else:
495
+ for _x, m in zip(x, self.quantizer_modules2):
496
+ _z_q, _min_indicies = m(_x)
497
+ z_q.append(_z_q)
498
+ min_indicies.append(_min_indicies) #B * T,
499
+ z_q = torch.cat(z_q, -1).reshape(xin.shape)
500
+ # loss = 0.25 * torch.mean((z_q.detach() - xin) ** 2) + torch.mean((z_q - xin.detach()) ** 2)
501
+ loss = self.codebook_loss_lambda * torch.mean((z_q - xin.detach()) ** 2) \
502
+ + self.commitment_loss_lambda * torch.mean((z_q.detach() - xin) ** 2)
503
+ z_q = xin + (z_q - xin).detach()
504
+ z_q = z_q.transpose(1, 2)
505
+ return z_q, loss, min_indicies
506
+
507
+ def forward(self, xin):
508
+ #B, C, T
509
+ quantized_out = 0.0
510
+ residual = xin
511
+ all_losses = []
512
+ all_indices = []
513
+ for i in range(self.residul_layer):
514
+ quantized, loss, indices = self.for_one_step(residual, i) #
515
+ residual = residual - quantized
516
+ quantized_out = quantized_out + quantized
517
+ all_indices.extend(indices) #
518
+ all_losses.append(loss)
519
+ all_losses = torch.stack(all_losses)
520
+ loss = torch.mean(all_losses)
521
+ return quantized_out, loss, all_indices
522
+
523
+ def embed(self, x):
524
+ #idx: N, T, 4
525
+ #print('x ', x.shape)
526
+ quantized_out = torch.tensor(0.0, device=x.device)
527
+ x = torch.split(x, 1, 2) # split along the last dimension; each chunk belongs to one index group
528
+ #print('x.shape ', len(x),x[0].shape)
529
+ for i in range(self.residul_layer):
530
+ ret = []
531
+ if i == 0:
532
+ for j in range(self.n_code_groups):
533
+ q = x[j]
534
+ embed = self.quantizer_modules[j]
535
+ q = embed.embedding(q.squeeze(-1).long())
536
+ ret.append(q)
537
+ ret = torch.cat(ret, -1)
538
+ #print(ret.shape)
539
+ quantized_out = quantized_out + ret
540
+ else:
541
+ for j in range(self.n_code_groups):
542
+ q = x[j + self.n_code_groups]
543
+ embed = self.quantizer_modules2[j]
544
+ q = embed.embedding(q.squeeze(-1).long())
545
+ ret.append(q)
546
+ ret = torch.cat(ret, -1)
547
+ quantized_out = quantized_out + ret
548
+ return quantized_out.transpose(1, 2) #N, C, T
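A minimal sketch (not part of the commit) of the two-stage residual Quantizer above on random encoder features; the config values are illustrative assumptions, not the released model configuration.

import torch
from inspiremusic.music_tokenizer.env import AttrDict
from inspiremusic.music_tokenizer.models import Quantizer

h = AttrDict({"n_code_groups": 4, "n_codes": 1024,
              "codebook_loss_lambda": 1.0, "commitment_loss_lambda": 0.25})
quantizer = Quantizer(h)
feats = torch.randn(2, 512, 50)                 # (B, C=512, T) encoder output
quantized, vq_loss, indices = quantizer(feats)  # straight-through quantized features
print(quantized.shape, len(indices))            # (2, 512, 50), 8 index groups (2 stages x 4 groups)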
inspiremusic/music_tokenizer/vqvae.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ from inspiremusic.music_tokenizer.env import AttrDict
20
+ from inspiremusic.music_tokenizer.models import Encoder
21
+ from inspiremusic.music_tokenizer.models import Generator
22
+ from inspiremusic.music_tokenizer.models import Quantizer
23
+
24
+
25
+ class VQVAE(nn.Module):
26
+ def __init__(self,
27
+ config_path,
28
+ ckpt_path,
29
+ with_encoder=False):
30
+ super(VQVAE, self).__init__()
31
+ ckpt = torch.load(ckpt_path)
32
+ with open(config_path) as f:
33
+ data = f.read()
34
+ json_config = json.loads(data)
35
+ self.h = AttrDict(json_config)
36
+ self.quantizer = Quantizer(self.h)
37
+ self.generator = Generator(self.h)
38
+ self.generator.load_state_dict(ckpt['generator'])
39
+ self.quantizer.load_state_dict(ckpt['quantizer'])
40
+ if with_encoder:
41
+ self.encoder = Encoder(self.h)
42
+ self.encoder.load_state_dict(ckpt['encoder'])
43
+
44
+ def forward(self, x):
45
+ # x is the codebook
46
+ # x.shape (B, T, Nq)
47
+ quant_emb = self.quantizer.embed(x)
48
+ return self.generator(quant_emb)
49
+
50
+ def encode(self, x):
51
+ batch_size = x.size(0)
52
+ if len(x.shape) == 3 and x.shape[-1] == 1:
53
+ x = x.squeeze(-1)
54
+ c = self.encoder(x.unsqueeze(1))
55
+ q, loss_q, c = self.quantizer(c)
56
+ c = [code.reshape(batch_size, -1) for code in c]
57
+ # shape: [N, T, 4]
58
+ return torch.stack(c, -1)
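A hedged sketch (not part of the commit) of round-tripping audio through the tokenizer; the config and checkpoint paths below are placeholders and must point to real music_tokenizer files.

import torch
from inspiremusic.music_tokenizer.vqvae import VQVAE

# Placeholder paths; replace with the downloaded music_tokenizer config/checkpoint.
vqvae = VQVAE("pretrained/music_tokenizer/config.json",
              "pretrained/music_tokenizer/model.pt", with_encoder=True).eval()
wav = torch.randn(1, 24000)        # (B, samples) dummy waveform
with torch.no_grad():
    codes = vqvae.encode(wav)      # (B, T, Nq) discrete token ids
    recon = vqvae(codes)           # (B, 1, samples') waveform from the generator
print(codes.shape, recon.shape)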
inspiremusic/text/abs_tokenizer.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from abc import ABC
16
+ from abc import abstractmethod
17
+ from typing import Iterable
18
+ from typing import List
19
+
20
+
21
+ class AbsTokenizer(ABC):
22
+ @abstractmethod
23
+ def text2tokens(self, line: str) -> List[str]:
24
+ raise NotImplementedError
25
+
26
+ @abstractmethod
27
+ def tokens2text(self, tokens: Iterable[str]) -> str:
28
+ raise NotImplementedError
29
+
30
+
31
+
32
+ def encode(self, line: str, **kwargs) -> List[str]:
33
+
34
+ return self.text2tokens(line)
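A toy subclass (not part of InspireMusic) just to show how encode delegates to text2tokens.

from typing import Iterable, List
from inspiremusic.text.abs_tokenizer import AbsTokenizer

class WhitespaceTokenizer(AbsTokenizer):
    def text2tokens(self, line: str) -> List[str]:
        return line.split()

    def tokens2text(self, tokens: Iterable[str]) -> str:
        return " ".join(tokens)

tok = WhitespaceTokenizer()
print(tok.encode("a gentle piano melody"))  # ['a', 'gentle', 'piano', 'melody']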
inspiremusic/text/tokenizer.py ADDED
@@ -0,0 +1,76 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import copy
16
+ import os
17
+ import re
18
+ from typing import Iterable, List, Union
19
+ import numpy as np
20
+ import torch
21
+
22
+ from inspiremusic.text.abs_tokenizer import AbsTokenizer
23
+ from transformers import AutoTokenizer
24
+
25
+ def get_tokenizer(tokenizer_name, tokenizer_path):
26
+ if "qwen" in tokenizer_name:
27
+ return QwenTokenizer(tokenizer_path,skip_special_tokens=True)
28
+ else:
29
+ return None
30
+
31
+ class QwenTokenizer(AbsTokenizer):
32
+ def __init__(
33
+ self,
34
+ token_path: str,
35
+ skip_special_tokens: bool = True,
36
+ ):
37
+ super().__init__()
38
+ # NOTE: for the non-chat model, all these special tokens remain randomly initialized.
39
+ special_tokens = {
40
+ 'eos_token': '<|endoftext|>',
41
+ 'pad_token': '<|endoftext|>',
42
+ 'additional_special_tokens': [
43
+ '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
44
+ '[breath]', '<strong>', '</strong>', '[noise]',
45
+ '[laughter]', '[cough]', '[clucking]', '[accent]',
46
+ '[quick_breath]',
47
+ ]
48
+ }
49
+ self.tokenizer = AutoTokenizer.from_pretrained(token_path)
50
+ self.tokenizer.add_special_tokens(special_tokens)
51
+ self.skip_special_tokens = skip_special_tokens
52
+
53
+ def get_vocab_size(self):
54
+ return self.tokenizer.vocab_size
55
+
56
+ def text2tokens(self, line: str) -> List:
57
+ tokens = self.tokenizer([line], return_tensors="pt")
58
+ tokens = tokens["input_ids"][0].cpu().tolist()
59
+ return tokens
60
+
61
+ def tokens2text(self, tokens) -> str:
62
+ tokens = torch.tensor(tokens, dtype=torch.int64)
63
+ text = self.tokenizer.batch_decode([tokens], skip_special_tokens=self.skip_special_tokens)[0]
64
+ return text
65
+
66
+
67
+
68
+ def get_qwen_vocab_size(token_type: str):
69
+ if "qwen1.5" in token_type.lower() or "qwen2.0" in token_type.lower() or "qwen2.5" in token_type.lower():
70
+ # 293 for special and extra tokens, including endoftext, im_start, im_end, endofprompt and others in the future.
71
+ # model.vocab_size = 151936, tokenizer.vocab_size = 151643
72
+ # NOTE: the first three special tokens (endoftext, im_start, im_end) are trained in Chat series models,
73
+ # others are kept in random initialization state.
74
+ return 151643 + 293
75
+ else:
76
+ raise ValueError(f"Unknown tokenizer {token_type}")
inspiremusic/transformer/__init__.py ADDED
File without changes
inspiremusic/transformer/activation.py ADDED
@@ -0,0 +1,84 @@
1
+ # Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
2
+ # 2020 Northwestern Polytechnical University (Pengcheng Guo)
3
+ # 2020 Mobvoi Inc (Binbin Zhang)
4
+ # 2024 Alibaba Inc (Xiang Lyu)
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """Swish() activation function for Conformer."""
18
+
19
+ import torch
20
+ from torch import nn, sin, pow
21
+ from torch.nn import Parameter
22
+
23
+
24
+ class Swish(torch.nn.Module):
25
+ """Construct an Swish object."""
26
+
27
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
28
+ """Return Swish activation function."""
29
+ return x * torch.sigmoid(x)
30
+
31
+
32
+ # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
33
+ # LICENSE is in incl_licenses directory.
34
+ class Snake(nn.Module):
35
+ '''
36
+ Implementation of a sine-based periodic activation function
37
+ Shape:
38
+ - Input: (B, C, T)
39
+ - Output: (B, C, T), same shape as the input
40
+ Parameters:
41
+ - alpha - trainable parameter
42
+ References:
43
+ - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
44
+ https://arxiv.org/abs/2006.08195
45
+ Examples:
46
+ >>> a1 = Snake(256)
47
+ >>> x = torch.randn(256)
48
+ >>> x = a1(x)
49
+ '''
50
+ def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
51
+ '''
52
+ Initialization.
53
+ INPUT:
54
+ - in_features: shape of the input
55
+ - alpha: trainable parameter
56
+ alpha is initialized to 1 by default, higher values = higher-frequency.
57
+ alpha will be trained along with the rest of your model.
58
+ '''
59
+ super(Snake, self).__init__()
60
+ self.in_features = in_features
61
+
62
+ # initialize alpha
63
+ self.alpha_logscale = alpha_logscale
64
+ if self.alpha_logscale: # log scale alphas initialized to zeros
65
+ self.alpha = Parameter(torch.zeros(in_features) * alpha)
66
+ else: # linear scale alphas initialized to ones
67
+ self.alpha = Parameter(torch.ones(in_features) * alpha)
68
+
69
+ self.alpha.requires_grad = alpha_trainable
70
+
71
+ self.no_div_by_zero = 0.000000001
72
+
73
+ def forward(self, x):
74
+ '''
75
+ Forward pass of the function.
76
+ Applies the function to the input elementwise.
77
+ Snake(x) := x + (1/a) * sin^2(a * x)
78
+ '''
79
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
80
+ if self.alpha_logscale:
81
+ alpha = torch.exp(alpha)
82
+ x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
83
+
84
+ return x
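A quick shape check (not part of the commit) for the two activations above; the sizes are arbitrary.

import torch
from inspiremusic.transformer.activation import Swish, Snake

x = torch.randn(2, 64, 100)                # (B, C, T)
print(Swish()(x).shape)                    # (2, 64, 100)
snake = Snake(in_features=64, alpha=1.0)   # one learnable alpha per channel
print(snake(x).shape)                      # (2, 64, 100)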
inspiremusic/transformer/attention.py ADDED
@@ -0,0 +1,328 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ # 2022 Xingchen Song ([email protected])
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """Multi-Head Attention layer definition."""
17
+
18
+ import math
19
+ from typing import Tuple
20
+
21
+ import torch
22
+ from torch import nn
23
+
24
+
25
+ class MultiHeadedAttention(nn.Module):
26
+ """Multi-Head Attention layer.
27
+
28
+ Args:
29
+ n_head (int): The number of heads.
30
+ n_feat (int): The number of features.
31
+ dropout_rate (float): Dropout rate.
32
+
33
+ """
34
+
35
+ def __init__(self,
36
+ n_head: int,
37
+ n_feat: int,
38
+ dropout_rate: float,
39
+ key_bias: bool = True):
40
+ """Construct an MultiHeadedAttention object."""
41
+ super().__init__()
42
+ assert n_feat % n_head == 0
43
+ # We assume d_v always equals d_k
44
+ self.d_k = n_feat // n_head
45
+ self.h = n_head
46
+ self.linear_q = nn.Linear(n_feat, n_feat)
47
+ self.linear_k = nn.Linear(n_feat, n_feat, bias=key_bias)
48
+ self.linear_v = nn.Linear(n_feat, n_feat)
49
+ self.linear_out = nn.Linear(n_feat, n_feat)
50
+ self.dropout = nn.Dropout(p=dropout_rate)
51
+
52
+ def forward_qkv(
53
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
54
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
55
+ """Transform query, key and value.
56
+
57
+ Args:
58
+ query (torch.Tensor): Query tensor (#batch, time1, size).
59
+ key (torch.Tensor): Key tensor (#batch, time2, size).
60
+ value (torch.Tensor): Value tensor (#batch, time2, size).
61
+
62
+ Returns:
63
+ torch.Tensor: Transformed query tensor, size
64
+ (#batch, n_head, time1, d_k).
65
+ torch.Tensor: Transformed key tensor, size
66
+ (#batch, n_head, time2, d_k).
67
+ torch.Tensor: Transformed value tensor, size
68
+ (#batch, n_head, time2, d_k).
69
+
70
+ """
71
+ n_batch = query.size(0)
72
+ q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
73
+ k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
74
+ v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
75
+ q = q.transpose(1, 2) # (batch, head, time1, d_k)
76
+ k = k.transpose(1, 2) # (batch, head, time2, d_k)
77
+ v = v.transpose(1, 2) # (batch, head, time2, d_k)
78
+
79
+ return q, k, v
80
+
81
+ def forward_attention(
82
+ self,
83
+ value: torch.Tensor,
84
+ scores: torch.Tensor,
85
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool)
86
+ ) -> torch.Tensor:
87
+ """Compute attention context vector.
88
+
89
+ Args:
90
+ value (torch.Tensor): Transformed value, size
91
+ (#batch, n_head, time2, d_k).
92
+ scores (torch.Tensor): Attention score, size
93
+ (#batch, n_head, time1, time2).
94
+ mask (torch.Tensor): Mask, size (#batch, 1, time2) or
95
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
96
+
97
+ Returns:
98
+ torch.Tensor: Transformed value (#batch, time1, d_model)
99
+ weighted by the attention score (#batch, time1, time2).
100
+
101
+ """
102
+ n_batch = value.size(0)
103
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be True?
104
+ # 1. onnx(16/4) [WHY? Because we feed real cache & real mask for the
105
+ # 1st chunk to ease the onnx export.]
106
+ # 2. pytorch training
107
+ if mask.size(2) > 0: # time2 > 0
108
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
109
+ # For last chunk, time2 might be larger than scores.size(-1)
110
+ mask = mask[:, :, :, :scores.size(-1)] # (batch, 1, *, time2)
111
+ scores = scores.masked_fill(mask, -float('inf'))
112
+ attn = torch.softmax(scores, dim=-1).masked_fill(
113
+ mask, 0.0) # (batch, head, time1, time2)
114
+ # NOTE(xcsong): When will `if mask.size(2) > 0` be False?
115
+ # 1. onnx(16/-1, -1/-1, 16/0)
116
+ # 2. jit (16/-1, -1/-1, 16/0, 16/4)
117
+ else:
118
+ attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
119
+
120
+ p_attn = self.dropout(attn)
121
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
122
+ x = (x.transpose(1, 2).contiguous().view(n_batch, -1,
123
+ self.h * self.d_k)
124
+ ) # (batch, time1, d_model)
125
+
126
+ return self.linear_out(x) # (batch, time1, d_model)
127
+
128
+ def forward(
129
+ self,
130
+ query: torch.Tensor,
131
+ key: torch.Tensor,
132
+ value: torch.Tensor,
133
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
134
+ pos_emb: torch.Tensor = torch.empty(0),
135
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
136
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
137
+ """Compute scaled dot product attention.
138
+
139
+ Args:
140
+ query (torch.Tensor): Query tensor (#batch, time1, size).
141
+ key (torch.Tensor): Key tensor (#batch, time2, size).
142
+ value (torch.Tensor): Value tensor (#batch, time2, size).
143
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
144
+ (#batch, time1, time2).
145
+ 1.When applying cross attention between decoder and encoder,
146
+ the batch padding mask for input is in (#batch, 1, T) shape.
147
+ 2.When applying self attention of encoder,
148
+ the mask is in (#batch, T, T) shape.
149
+ 3.When applying self attention of decoder,
150
+ the mask is in (#batch, L, L) shape.
151
+ 4.If the different position in decoder see different block
152
+ of the encoder, such as Mocha, the passed in mask could be
153
+ in (#batch, L, T) shape. But there is no such case in current
154
+ InspireMusic.
155
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
156
+ where `cache_t == chunk_size * num_decoding_left_chunks`
157
+ and `head * d_k == size`
158
+
159
+
160
+ Returns:
161
+ torch.Tensor: Output tensor (#batch, time1, d_model).
162
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
163
+ where `cache_t == chunk_size * num_decoding_left_chunks`
164
+ and `head * d_k == size`
165
+
166
+ """
167
+ q, k, v = self.forward_qkv(query, key, value)
168
+
169
+ # NOTE(xcsong):
170
+ # when export onnx model, for 1st chunk, we feed
171
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
172
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
173
+ # In all modes, `if cache.size(0) > 0` will alwayse be `True`
174
+ # and we will always do splitting and
175
+ # concatnation(this will simplify onnx export). Note that
176
+ # it's OK to concat & split zero-shaped tensors(see code below).
177
+ # when export jit model, for 1st chunk, we always feed
178
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
179
+ # >>> a = torch.ones((1, 2, 0, 4))
180
+ # >>> b = torch.ones((1, 2, 3, 4))
181
+ # >>> c = torch.cat((a, b), dim=2)
182
+ # >>> torch.equal(b, c) # True
183
+ # >>> d = torch.split(a, 2, dim=-1)
184
+ # >>> torch.equal(d[0], d[1]) # True
185
+ if cache.size(0) > 0:
186
+ key_cache, value_cache = torch.split(cache,
187
+ cache.size(-1) // 2,
188
+ dim=-1)
189
+ k = torch.cat([key_cache, k], dim=2)
190
+ v = torch.cat([value_cache, v], dim=2)
191
+
192
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
193
+ # non-trivial to calculate `next_cache_start` here.
194
+ new_cache = torch.cat((k, v), dim=-1)
195
+
196
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
197
+ return self.forward_attention(v, scores, mask), new_cache
198
+
199
+
200
+ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
201
+ """Multi-Head Attention layer with relative position encoding.
202
+ Paper: https://arxiv.org/abs/1901.02860
203
+ Args:
204
+ n_head (int): The number of heads.
205
+ n_feat (int): The number of features.
206
+ dropout_rate (float): Dropout rate.
207
+ """
208
+
209
+ def __init__(self,
210
+ n_head: int,
211
+ n_feat: int,
212
+ dropout_rate: float,
213
+ key_bias: bool = True):
214
+ """Construct an RelPositionMultiHeadedAttention object."""
215
+ super().__init__(n_head, n_feat, dropout_rate, key_bias)
216
+ # linear transformation for positional encoding
217
+ self.linear_pos = nn.Linear(n_feat, n_feat, bias=False)
218
+ # these two learnable bias are used in matrix c and matrix d
219
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
220
+ self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
221
+ self.pos_bias_v = nn.Parameter(torch.Tensor(self.h, self.d_k))
222
+ torch.nn.init.xavier_uniform_(self.pos_bias_u)
223
+ torch.nn.init.xavier_uniform_(self.pos_bias_v)
224
+
225
+ def rel_shift(self, x: torch.Tensor) -> torch.Tensor:
226
+ """Compute relative positional encoding.
227
+
228
+ Args:
229
+ x (torch.Tensor): Input tensor (batch, head, time1, 2*time1-1).
230
+ time1 means the length of query vector.
231
+
232
+ Returns:
233
+ torch.Tensor: Output tensor.
234
+
235
+ """
236
+ zero_pad = torch.zeros((x.size()[0], x.size()[1], x.size()[2], 1),
237
+ device=x.device,
238
+ dtype=x.dtype)
239
+ x_padded = torch.cat([zero_pad, x], dim=-1)
240
+
241
+ x_padded = x_padded.view(x.size()[0],
242
+ x.size()[1],
243
+ x.size(3) + 1, x.size(2))
244
+ x = x_padded[:, :, 1:].view_as(x)[
245
+ :, :, :, : x.size(-1) // 2 + 1
246
+ ] # only keep the positions from 0 to time2
247
+ return x
248
+
249
+ def forward(
250
+ self,
251
+ query: torch.Tensor,
252
+ key: torch.Tensor,
253
+ value: torch.Tensor,
254
+ mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
255
+ pos_emb: torch.Tensor = torch.empty(0),
256
+ cache: torch.Tensor = torch.zeros((0, 0, 0, 0))
257
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
258
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
259
+ Args:
260
+ query (torch.Tensor): Query tensor (#batch, time1, size).
261
+ key (torch.Tensor): Key tensor (#batch, time2, size).
262
+ value (torch.Tensor): Value tensor (#batch, time2, size).
263
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
264
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
265
+ pos_emb (torch.Tensor): Positional embedding tensor
266
+ (#batch, time2, size).
267
+ cache (torch.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
268
+ where `cache_t == chunk_size * num_decoding_left_chunks`
269
+ and `head * d_k == size`
270
+ Returns:
271
+ torch.Tensor: Output tensor (#batch, time1, d_model).
272
+ torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
273
+ where `cache_t == chunk_size * num_decoding_left_chunks`
274
+ and `head * d_k == size`
275
+ """
276
+ q, k, v = self.forward_qkv(query, key, value)
277
+ q = q.transpose(1, 2) # (batch, time1, head, d_k)
278
+
279
+ # NOTE(xcsong):
280
+ # when export onnx model, for 1st chunk, we feed
281
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
282
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
283
+ # In all modes, `if cache.size(0) > 0` will alwayse be `True`
284
+ # and we will always do splitting and
285
+ # concatnation(this will simplify onnx export). Note that
286
+ # it's OK to concat & split zero-shaped tensors(see code below).
287
+ # when export jit model, for 1st chunk, we always feed
288
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
289
+ # >>> a = torch.ones((1, 2, 0, 4))
290
+ # >>> b = torch.ones((1, 2, 3, 4))
291
+ # >>> c = torch.cat((a, b), dim=2)
292
+ # >>> torch.equal(b, c) # True
293
+ # >>> d = torch.split(a, 2, dim=-1)
294
+ # >>> torch.equal(d[0], d[1]) # True
295
+ if cache.size(0) > 0:
296
+ key_cache, value_cache = torch.split(cache,
297
+ cache.size(-1) // 2,
298
+ dim=-1)
299
+ k = torch.cat([key_cache, k], dim=2)
300
+ v = torch.cat([value_cache, v], dim=2)
301
+ # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
302
+ # non-trivial to calculate `next_cache_start` here.
303
+ new_cache = torch.cat((k, v), dim=-1)
304
+
305
+ n_batch_pos = pos_emb.size(0)
306
+ p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
307
+ p = p.transpose(1, 2) # (batch, head, time1, d_k)
308
+
309
+ # (batch, head, time1, d_k)
310
+ q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2)
311
+ # (batch, head, time1, d_k)
312
+ q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2)
313
+
314
+ # compute attention score
315
+ # first compute matrix a and matrix c
316
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
317
+ # (batch, head, time1, time2)
318
+ matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1))
319
+ # compute matrix b and matrix d
320
+ # (batch, head, time1, time2)
321
+ matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1))
322
+ # NOTE(Xiang Lyu): Keep rel_shift since espnet rel_pos_emb is used
323
+ if matrix_ac.shape != matrix_bd.shape:
324
+ matrix_bd = self.rel_shift(matrix_bd)
325
+
326
+ scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2)
327
+
328
+ return self.forward_attention(v, scores, mask), new_cache
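A minimal self-attention call (not part of the commit) showing how the K/V cache is threaded between chunks; the shapes are illustrative.

import torch
from inspiremusic.transformer.attention import MultiHeadedAttention

mha = MultiHeadedAttention(n_head=4, n_feat=256, dropout_rate=0.0)
x1 = torch.randn(1, 10, 256)
out1, cache = mha(x1, x1, x1)               # cache = concat(K, V): (1, 4, 10, 128)
x2 = torch.randn(1, 5, 256)
out2, cache = mha(x2, x2, x2, cache=cache)  # the new chunk also attends to cached K/V
print(out1.shape, out2.shape, cache.shape)  # (1, 10, 256) (1, 5, 256) (1, 4, 15, 128)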
inspiremusic/transformer/convolution.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """ConvolutionModule definition."""
17
+
18
+ from typing import Tuple
19
+
20
+ import torch
21
+ from torch import nn
22
+
23
+
24
+ class ConvolutionModule(nn.Module):
25
+ """ConvolutionModule in Conformer model."""
26
+
27
+ def __init__(self,
28
+ channels: int,
29
+ kernel_size: int = 15,
30
+ activation: nn.Module = nn.ReLU(),
31
+ norm: str = "batch_norm",
32
+ causal: bool = False,
33
+ bias: bool = True):
34
+ """Construct an ConvolutionModule object.
35
+ Args:
36
+ channels (int): The number of channels of conv layers.
37
+ kernel_size (int): Kernel size of conv layers.
38
+ causal (bool): Whether to use causal convolution or not
39
+ """
40
+ super().__init__()
41
+
42
+ self.pointwise_conv1 = nn.Conv1d(
43
+ channels,
44
+ 2 * channels,
45
+ kernel_size=1,
46
+ stride=1,
47
+ padding=0,
48
+ bias=bias,
49
+ )
50
+ # self.lorder is used to distinguish if it's a causal convolution,
51
+ # if self.lorder > 0: it's a causal convolution, the input will be
52
+ # padded with self.lorder frames on the left in forward.
53
+ # else: it's a symmetrical convolution
54
+ if causal:
55
+ padding = 0
56
+ self.lorder = kernel_size - 1
57
+ else:
58
+ # kernel_size should be an odd number for none causal convolution
59
+ assert (kernel_size - 1) % 2 == 0
60
+ padding = (kernel_size - 1) // 2
61
+ self.lorder = 0
62
+ self.depthwise_conv = nn.Conv1d(
63
+ channels,
64
+ channels,
65
+ kernel_size,
66
+ stride=1,
67
+ padding=padding,
68
+ groups=channels,
69
+ bias=bias,
70
+ )
71
+
72
+ assert norm in ['batch_norm', 'layer_norm']
73
+ if norm == "batch_norm":
74
+ self.use_layer_norm = False
75
+ self.norm = nn.BatchNorm1d(channels)
76
+ else:
77
+ self.use_layer_norm = True
78
+ self.norm = nn.LayerNorm(channels)
79
+
80
+ self.pointwise_conv2 = nn.Conv1d(
81
+ channels,
82
+ channels,
83
+ kernel_size=1,
84
+ stride=1,
85
+ padding=0,
86
+ bias=bias,
87
+ )
88
+ self.activation = activation
89
+
90
+ def forward(
91
+ self,
92
+ x: torch.Tensor,
93
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
94
+ cache: torch.Tensor = torch.zeros((0, 0, 0)),
95
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
96
+ """Compute convolution module.
97
+ Args:
98
+ x (torch.Tensor): Input tensor (#batch, time, channels).
99
+ mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
100
+ (0, 0, 0) means fake mask.
101
+ cache (torch.Tensor): left context cache, it is only
102
+ used in causal convolution (#batch, channels, cache_t),
103
+ (0, 0, 0) means fake cache.
104
+ Returns:
105
+ torch.Tensor: Output tensor (#batch, time, channels).
106
+ """
107
+ # exchange the temporal dimension and the feature dimension
108
+ x = x.transpose(1, 2) # (#batch, channels, time)
109
+
110
+ # mask batch padding
111
+ if mask_pad.size(2) > 0: # time > 0
112
+ x.masked_fill_(~mask_pad, 0.0)
113
+
114
+ if self.lorder > 0:
115
+ if cache.size(2) == 0: # cache_t == 0
116
+ x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
117
+ else:
118
+ assert cache.size(0) == x.size(0) # equal batch
119
+ assert cache.size(1) == x.size(1) # equal channel
120
+ x = torch.cat((cache, x), dim=2)
121
+ assert (x.size(2) > self.lorder)
122
+ new_cache = x[:, :, -self.lorder:]
123
+ else:
124
+ # It's better we just return None if no cache is required,
125
+ # However, for JIT export, here we just fake one tensor instead of
126
+ # None.
127
+ new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
128
+
129
+ # GLU mechanism
130
+ x = self.pointwise_conv1(x) # (batch, 2*channel, dim)
131
+ x = nn.functional.glu(x, dim=1) # (batch, channel, dim)
132
+
133
+ # 1D Depthwise Conv
134
+ x = self.depthwise_conv(x)
135
+ if self.use_layer_norm:
136
+ x = x.transpose(1, 2)
137
+ x = self.activation(self.norm(x))
138
+ if self.use_layer_norm:
139
+ x = x.transpose(1, 2)
140
+ x = self.pointwise_conv2(x)
141
+ # mask batch padding
142
+ if mask_pad.size(2) > 0: # time > 0
143
+ x.masked_fill_(~mask_pad, 0.0)
144
+
145
+ return x.transpose(1, 2), new_cache
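A short sketch (not part of the commit) of the causal branch with its rolling left-context cache; the sizes are illustrative.

import torch
from inspiremusic.transformer.convolution import ConvolutionModule

conv = ConvolutionModule(channels=256, kernel_size=15, causal=True)
x = torch.randn(2, 50, 256)                              # (B, T, C)
y, cache = conv(x)                                       # first chunk is zero-padded on the left
y2, cache = conv(torch.randn(2, 50, 256), cache=cache)   # next chunk reuses 14 cached frames
print(y.shape, y2.shape, cache.shape)                    # (2, 50, 256) (2, 50, 256) (2, 256, 14)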
inspiremusic/transformer/decoder.py ADDED
@@ -0,0 +1,396 @@
1
+ # Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Decoder definition."""
17
+ from typing import Tuple, List, Optional
18
+
19
+ import torch
20
+ import torch.utils.checkpoint as ckpt
21
+ import logging
22
+
23
+ from inspiremusic.transformer.decoder_layer import DecoderLayer
24
+ from inspiremusic.transformer.positionwise_feed_forward import PositionwiseFeedForward
25
+ from inspiremusic.utils.class_utils import (
26
+ INSPIREMUSIC_EMB_CLASSES,
27
+ INSPIREMUSIC_ATTENTION_CLASSES,
28
+ INSPIREMUSIC_ACTIVATION_CLASSES,
29
+ )
30
+ from inspiremusic.utils.mask import (subsequent_mask, make_pad_mask)
31
+
32
+
33
+ class TransformerDecoder(torch.nn.Module):
34
+ """Base class of Transfomer decoder module.
35
+ Args:
36
+ vocab_size: output dim
37
+ encoder_output_size: dimension of attention
38
+ attention_heads: the number of heads of multi head attention
39
+ linear_units: the hidden units number of position-wise feedforward
40
+ num_blocks: the number of decoder blocks
41
+ dropout_rate: dropout rate
42
+ self_attention_dropout_rate: dropout rate for attention
43
+ input_layer: input layer type
44
+ use_output_layer: whether to use output layer
45
+ pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
46
+ normalize_before:
47
+ True: use layer_norm before each sub-block of a layer.
48
+ False: use layer_norm after each sub-block of a layer.
49
+ src_attention: if false, encoder-decoder cross attention is not
50
+ applied, such as CIF model
51
+ key_bias: whether to use bias in attention.linear_k, False for whisper models.
52
+ gradient_checkpointing: rerunning a forward-pass segment for each
53
+ checkpointed segment during backward.
54
+ tie_word_embedding: Tie or clone module weights depending on whether we are
55
+ using TorchScript or not
56
+ """
57
+
58
+ def __init__(
59
+ self,
60
+ vocab_size: int,
61
+ encoder_output_size: int,
62
+ attention_heads: int = 4,
63
+ linear_units: int = 2048,
64
+ num_blocks: int = 6,
65
+ dropout_rate: float = 0.1,
66
+ positional_dropout_rate: float = 0.1,
67
+ self_attention_dropout_rate: float = 0.0,
68
+ src_attention_dropout_rate: float = 0.0,
69
+ input_layer: str = "embed",
70
+ use_output_layer: bool = True,
71
+ normalize_before: bool = True,
72
+ src_attention: bool = True,
73
+ key_bias: bool = True,
74
+ activation_type: str = "relu",
75
+ gradient_checkpointing: bool = False,
76
+ tie_word_embedding: bool = False,
77
+ ):
78
+ super().__init__()
79
+ attention_dim = encoder_output_size
80
+ activation = INSPIREMUSIC_ACTIVATION_CLASSES[activation_type]()
81
+
82
+ self.embed = torch.nn.Sequential(
83
+ torch.nn.Identity() if input_layer == "no_pos" else
84
+ torch.nn.Embedding(vocab_size, attention_dim),
85
+ INSPIREMUSIC_EMB_CLASSES[input_layer](attention_dim,
86
+ positional_dropout_rate),
87
+ )
88
+
89
+ self.normalize_before = normalize_before
90
+ self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-5)
91
+ self.use_output_layer = use_output_layer
92
+ if use_output_layer:
93
+ self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
94
+ else:
95
+ self.output_layer = torch.nn.Identity()
96
+ self.num_blocks = num_blocks
97
+ self.decoders = torch.nn.ModuleList([
98
+ DecoderLayer(
99
+ attention_dim,
100
+ INSPIREMUSIC_ATTENTION_CLASSES["selfattn"](
101
+ attention_heads, attention_dim,
102
+ self_attention_dropout_rate, key_bias),
103
+ INSPIREMUSIC_ATTENTION_CLASSES["selfattn"](
104
+ attention_heads, attention_dim, src_attention_dropout_rate,
105
+ key_bias) if src_attention else None,
106
+ PositionwiseFeedForward(attention_dim, linear_units,
107
+ dropout_rate, activation),
108
+ dropout_rate,
109
+ normalize_before,
110
+ ) for _ in range(self.num_blocks)
111
+ ])
112
+
113
+ self.gradient_checkpointing = gradient_checkpointing
114
+ self.tie_word_embedding = tie_word_embedding
115
+
116
+ def forward(
117
+ self,
118
+ memory: torch.Tensor,
119
+ memory_mask: torch.Tensor,
120
+ ys_in_pad: torch.Tensor,
121
+ ys_in_lens: torch.Tensor,
122
+ r_ys_in_pad: torch.Tensor = torch.empty(0),
123
+ reverse_weight: float = 0.0,
124
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
125
+ """Forward decoder.
126
+ Args:
127
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
128
+ memory_mask: encoder memory mask, (batch, 1, maxlen_in)
129
+ ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
130
+ ys_in_lens: input lengths of this batch (batch)
131
+ r_ys_in_pad: not used in transformer decoder, in order to unify api
132
+ with bidirectional decoder
133
+ reverse_weight: not used in transformer decoder, in order to unify
134
+ api with bidirectional decoder
135
+ Returns:
136
+ (tuple): tuple containing:
137
+ x: decoded token score before softmax (batch, maxlen_out,
138
+ vocab_size) if use_output_layer is True,
139
+ torch.tensor(0.0), in order to unify api with bidirectional decoder
140
+ olens: (batch, )
141
+ NOTE(xcsong):
142
+ We pass the `__call__` method of the modules instead of `forward` to the
143
+ checkpointing API because `__call__` attaches all the hooks of the module.
144
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
145
+ """
146
+ tgt = ys_in_pad
147
+ maxlen = tgt.size(1)
148
+ # tgt_mask: (B, 1, L)
149
+ tgt_mask = ~make_pad_mask(ys_in_lens, maxlen).unsqueeze(1)
150
+ tgt_mask = tgt_mask.to(tgt.device)
151
+ # m: (1, L, L)
152
+ m = subsequent_mask(tgt_mask.size(-1),
153
+ device=tgt_mask.device).unsqueeze(0)
154
+ # tgt_mask: (B, L, L)
155
+ tgt_mask = tgt_mask & m
156
+ x, _ = self.embed(tgt)
157
+ if self.gradient_checkpointing and self.training:
158
+ x = self.forward_layers_checkpointed(x, tgt_mask, memory,
159
+ memory_mask)
160
+ else:
161
+ x = self.forward_layers(x, tgt_mask, memory, memory_mask)
162
+ if self.normalize_before:
163
+ x = self.after_norm(x)
164
+ if self.use_output_layer:
165
+ x = self.output_layer(x)
166
+ olens = tgt_mask.sum(1)
167
+ return x, torch.tensor(0.0), olens
168
+
169
+ def forward_layers(self, x: torch.Tensor, tgt_mask: torch.Tensor,
170
+ memory: torch.Tensor,
171
+ memory_mask: torch.Tensor) -> torch.Tensor:
172
+ for layer in self.decoders:
173
+ x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
174
+ memory_mask)
175
+ return x
176
+
177
+ @torch.jit.unused
178
+ def forward_layers_checkpointed(self, x: torch.Tensor,
179
+ tgt_mask: torch.Tensor,
180
+ memory: torch.Tensor,
181
+ memory_mask: torch.Tensor) -> torch.Tensor:
182
+ for layer in self.decoders:
183
+ x, tgt_mask, memory, memory_mask = ckpt.checkpoint(
184
+ layer.__call__, x, tgt_mask, memory, memory_mask)
185
+ return x
186
+
187
+ def forward_one_step(
188
+ self,
189
+ memory: torch.Tensor,
190
+ memory_mask: torch.Tensor,
191
+ tgt: torch.Tensor,
192
+ tgt_mask: torch.Tensor,
193
+ cache: Optional[List[torch.Tensor]] = None,
194
+ ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
195
+ """Forward one step.
196
+ This is only used for decoding.
197
+ Args:
198
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
199
+ memory_mask: encoded memory mask, (batch, 1, maxlen_in)
200
+ tgt: input token ids, int64 (batch, maxlen_out)
201
+ tgt_mask: input token mask, (batch, maxlen_out)
202
+ dtype=torch.uint8 in PyTorch 1.2-
203
+ dtype=torch.bool in PyTorch 1.2+ (include 1.2)
204
+ cache: cached output list of (batch, max_time_out-1, size)
205
+ Returns:
206
+ y, cache: NN output value and cache per `self.decoders`.
207
+ y.shape` is (batch, maxlen_out, token)
208
+ """
209
+ x, _ = self.embed(tgt)
210
+ new_cache = []
211
+ for i, decoder in enumerate(self.decoders):
212
+ if cache is None:
213
+ c = None
214
+ else:
215
+ c = cache[i]
216
+ x, tgt_mask, memory, memory_mask = decoder(x,
217
+ tgt_mask,
218
+ memory,
219
+ memory_mask,
220
+ cache=c)
221
+ new_cache.append(x)
222
+ if self.normalize_before:
223
+ y = self.after_norm(x[:, -1])
224
+ else:
225
+ y = x[:, -1]
226
+ if self.use_output_layer:
227
+ y = torch.log_softmax(self.output_layer(y), dim=-1)
228
+ return y, new_cache
229
+
230
+ def tie_or_clone_weights(self, jit_mode: bool = True):
231
+ """Tie or clone module weights (between word_emb and output_layer)
232
+ depending on whether we are using TorchScript or not"""
233
+ if not self.use_output_layer:
234
+ return
235
+ if jit_mode:
236
+ logging.info("clone emb.weight to output.weight")
237
+ self.output_layer.weight = torch.nn.Parameter(
238
+ self.embed[0].weight.clone())
239
+ else:
240
+ logging.info("tie emb.weight with output.weight")
241
+ self.output_layer.weight = self.embed[0].weight
242
+
243
+ if getattr(self.output_layer, "bias", None) is not None:
244
+ self.output_layer.bias.data = torch.nn.functional.pad(
245
+ self.output_layer.bias.data,
246
+ (
247
+ 0,
248
+ self.output_layer.weight.shape[0] -
249
+ self.output_layer.bias.shape[0],
250
+ ),
251
+ "constant",
252
+ 0,
253
+ )
254
+
255
+
256
+ class BiTransformerDecoder(torch.nn.Module):
257
+ """Base class of Transfomer decoder module.
258
+ Args:
259
+ vocab_size: output dim
260
+ encoder_output_size: dimension of attention
261
+ attention_heads: the number of heads of multi head attention
262
+ linear_units: the hidden units number of position-wise feedforward
263
+ num_blocks: the number of decoder blocks
264
+ r_num_blocks: the number of right to left decoder blocks
265
+ dropout_rate: dropout rate
266
+ self_attention_dropout_rate: dropout rate for attention
267
+ input_layer: input layer type
268
+ use_output_layer: whether to use output layer
269
+ pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
270
+ normalize_before:
271
+ True: use layer_norm before each sub-block of a layer.
272
+ False: use layer_norm after each sub-block of a layer.
273
+ key_bias: whether to use bias in attention.linear_k; False for whisper models.
274
+ """
275
+
276
+ def __init__(
277
+ self,
278
+ vocab_size: int,
279
+ encoder_output_size: int,
280
+ attention_heads: int = 4,
281
+ linear_units: int = 2048,
282
+ num_blocks: int = 6,
283
+ r_num_blocks: int = 0,
284
+ dropout_rate: float = 0.1,
285
+ positional_dropout_rate: float = 0.1,
286
+ self_attention_dropout_rate: float = 0.0,
287
+ src_attention_dropout_rate: float = 0.0,
288
+ input_layer: str = "embed",
289
+ use_output_layer: bool = True,
290
+ normalize_before: bool = True,
291
+ key_bias: bool = True,
292
+ gradient_checkpointing: bool = False,
293
+ tie_word_embedding: bool = False,
294
+ ):
295
+
296
+ super().__init__()
297
+ self.tie_word_embedding = tie_word_embedding
298
+ self.left_decoder = TransformerDecoder(
299
+ vocab_size,
300
+ encoder_output_size,
301
+ attention_heads,
302
+ linear_units,
303
+ num_blocks,
304
+ dropout_rate,
305
+ positional_dropout_rate,
306
+ self_attention_dropout_rate,
307
+ src_attention_dropout_rate,
308
+ input_layer,
309
+ use_output_layer,
310
+ normalize_before,
311
+ key_bias=key_bias,
312
+ gradient_checkpointing=gradient_checkpointing,
313
+ tie_word_embedding=tie_word_embedding)
314
+
315
+ self.right_decoder = TransformerDecoder(
316
+ vocab_size,
317
+ encoder_output_size,
318
+ attention_heads,
319
+ linear_units,
320
+ r_num_blocks,
321
+ dropout_rate,
322
+ positional_dropout_rate,
323
+ self_attention_dropout_rate,
324
+ src_attention_dropout_rate,
325
+ input_layer,
326
+ use_output_layer,
327
+ normalize_before,
328
+ key_bias=key_bias,
329
+ gradient_checkpointing=gradient_checkpointing,
330
+ tie_word_embedding=tie_word_embedding)
331
+
332
+ def forward(
333
+ self,
334
+ memory: torch.Tensor,
335
+ memory_mask: torch.Tensor,
336
+ ys_in_pad: torch.Tensor,
337
+ ys_in_lens: torch.Tensor,
338
+ r_ys_in_pad: torch.Tensor,
339
+ reverse_weight: float = 0.0,
340
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
341
+ """Forward decoder.
342
+ Args:
343
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
344
+ memory_mask: encoder memory mask, (batch, 1, maxlen_in)
345
+ ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
346
+ ys_in_lens: input lengths of this batch (batch)
347
+ r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
348
+ used for right to left decoder
349
+ reverse_weight: used for right to left decoder
350
+ Returns:
351
+ (tuple): tuple containing:
352
+ x: decoded token score before softmax (batch, maxlen_out,
353
+ vocab_size) if use_output_layer is True,
354
+ r_x: decoded token score (right to left decoder)
355
+ before softmax (batch, maxlen_out, vocab_size)
356
+ if use_output_layer is True,
357
+ olens: (batch, )
358
+ """
359
+ l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
360
+ ys_in_lens)
361
+ r_x = torch.tensor(0.0)
362
+ if reverse_weight > 0.0:
363
+ r_x, _, olens = self.right_decoder(memory, memory_mask,
364
+ r_ys_in_pad, ys_in_lens)
365
+ return l_x, r_x, olens
366
+
367
+ def forward_one_step(
368
+ self,
369
+ memory: torch.Tensor,
370
+ memory_mask: torch.Tensor,
371
+ tgt: torch.Tensor,
372
+ tgt_mask: torch.Tensor,
373
+ cache: Optional[List[torch.Tensor]] = None,
374
+ ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
375
+ """Forward one step.
376
+ This is only used for decoding.
377
+ Args:
378
+ memory: encoded memory, float32 (batch, maxlen_in, feat)
379
+ memory_mask: encoded memory mask, (batch, 1, maxlen_in)
380
+ tgt: input token ids, int64 (batch, maxlen_out)
381
+ tgt_mask: input token mask, (batch, maxlen_out)
382
+ dtype=torch.uint8 in PyTorch 1.2-
383
+ dtype=torch.bool in PyTorch 1.2+ (include 1.2)
384
+ cache: cached output list of (batch, max_time_out-1, size)
385
+ Returns:
386
+ y, cache: NN output value and cache per `self.decoders`.
387
+ `y.shape` is (batch, maxlen_out, token)
388
+ """
389
+ return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
390
+ tgt_mask, cache)
391
+
392
+ def tie_or_clone_weights(self, jit_mode: bool = True):
393
+ """Tie or clone module weights (between word_emb and output_layer)
394
+ depending on whether we are using TorchScript or not"""
395
+ self.left_decoder.tie_or_clone_weights(jit_mode)
396
+ self.right_decoder.tie_or_clone_weights(jit_mode)
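The decoder above is teacher-forced: `ys_in_pad` carries right-shifted target ids, `ys_in_lens` their true lengths, and the causal mask is built inside `forward`. A minimal sketch of driving it with toy tensors (illustrative shapes only; it assumes the `inspiremusic` package and its registered embedding/attention classes are importable):

```python
import torch
from inspiremusic.transformer.decoder import TransformerDecoder

# Toy sizes: 100-token vocabulary, 64-dim encoder output, 2 decoder blocks.
decoder = TransformerDecoder(vocab_size=100, encoder_output_size=64,
                             attention_heads=4, linear_units=256, num_blocks=2)

batch, t_in, t_out = 2, 20, 7
memory = torch.randn(batch, t_in, 64)                  # encoder output
memory_mask = torch.ones(batch, 1, t_in, dtype=torch.bool)
ys_in_pad = torch.randint(0, 100, (batch, t_out))      # right-shifted target ids
ys_in_lens = torch.tensor([t_out, t_out - 2])          # second sequence is padded

logits, _, olens = decoder(memory, memory_mask, ys_in_pad, ys_in_lens)
print(logits.shape)   # (2, 7, 100): per-position token scores before softmax
```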
inspiremusic/transformer/decoder_layer.py ADDED
@@ -0,0 +1,132 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Decoder self-attention layer definition."""
16
+ from typing import Optional, Tuple
17
+
18
+ import torch
19
+ from torch import nn
20
+
21
+
22
+ class DecoderLayer(nn.Module):
23
+ """Single decoder layer module.
24
+
25
+ Args:
26
+ size (int): Input dimension.
27
+ self_attn (torch.nn.Module): Self-attention module instance.
28
+ `MultiHeadedAttention` instance can be used as the argument.
29
+ src_attn (torch.nn.Module): Inter-attention module instance.
30
+ `MultiHeadedAttention` instance can be used as the argument.
31
+ If `None` is passed, Inter-attention is not used, such as
32
+ CIF, GPT, and other decoder-only models.
33
+ feed_forward (torch.nn.Module): Feed-forward module instance.
34
+ `PositionwiseFeedForward` instance can be used as the argument.
35
+ dropout_rate (float): Dropout rate.
36
+ normalize_before (bool):
37
+ True: use layer_norm before each sub-block.
38
+ False: use layer_norm after each sub-block.
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ size: int,
44
+ self_attn: nn.Module,
45
+ src_attn: Optional[nn.Module],
46
+ feed_forward: nn.Module,
47
+ dropout_rate: float,
48
+ normalize_before: bool = True,
49
+ ):
50
+ """Construct an DecoderLayer object."""
51
+ super().__init__()
52
+ self.size = size
53
+ self.self_attn = self_attn
54
+ self.src_attn = src_attn
55
+ self.feed_forward = feed_forward
56
+ self.norm1 = nn.LayerNorm(size, eps=1e-5)
57
+ self.norm2 = nn.LayerNorm(size, eps=1e-5)
58
+ self.norm3 = nn.LayerNorm(size, eps=1e-5)
59
+ self.dropout = nn.Dropout(dropout_rate)
60
+ self.normalize_before = normalize_before
61
+
62
+ def forward(
63
+ self,
64
+ tgt: torch.Tensor,
65
+ tgt_mask: torch.Tensor,
66
+ memory: torch.Tensor,
67
+ memory_mask: torch.Tensor,
68
+ cache: Optional[torch.Tensor] = None
69
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
70
+ """Compute decoded features.
71
+
72
+ Args:
73
+ tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
74
+ tgt_mask (torch.Tensor): Mask for input tensor
75
+ (#batch, maxlen_out).
76
+ memory (torch.Tensor): Encoded memory
77
+ (#batch, maxlen_in, size).
78
+ memory_mask (torch.Tensor): Encoded memory mask
79
+ (#batch, maxlen_in).
80
+ cache (torch.Tensor): cached tensors.
81
+ (#batch, maxlen_out - 1, size).
82
+
83
+ Returns:
84
+ torch.Tensor: Output tensor (#batch, maxlen_out, size).
85
+ torch.Tensor: Mask for output tensor (#batch, maxlen_out).
86
+ torch.Tensor: Encoded memory (#batch, maxlen_in, size).
87
+ torch.Tensor: Encoded memory mask (#batch, maxlen_in).
88
+
89
+ """
90
+ residual = tgt
91
+ if self.normalize_before:
92
+ tgt = self.norm1(tgt)
93
+
94
+ if cache is None:
95
+ tgt_q = tgt
96
+ tgt_q_mask = tgt_mask
97
+ else:
98
+ # compute only the last frame query keeping dim: max_time_out -> 1
99
+ assert cache.shape == (
100
+ tgt.shape[0],
101
+ tgt.shape[1] - 1,
102
+ self.size,
103
+ ), "{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
104
+ tgt_q = tgt[:, -1:, :]
105
+ residual = residual[:, -1:, :]
106
+ tgt_q_mask = tgt_mask[:, -1:, :]
107
+
108
+ x = residual + self.dropout(
109
+ self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
110
+ if not self.normalize_before:
111
+ x = self.norm1(x)
112
+
113
+ if self.src_attn is not None:
114
+ residual = x
115
+ if self.normalize_before:
116
+ x = self.norm2(x)
117
+ x = residual + self.dropout(
118
+ self.src_attn(x, memory, memory, memory_mask)[0])
119
+ if not self.normalize_before:
120
+ x = self.norm2(x)
121
+
122
+ residual = x
123
+ if self.normalize_before:
124
+ x = self.norm3(x)
125
+ x = residual + self.dropout(self.feed_forward(x))
126
+ if not self.normalize_before:
127
+ x = self.norm3(x)
128
+
129
+ if cache is not None:
130
+ x = torch.cat([cache, x], dim=1)
131
+
132
+ return x, tgt_mask, memory, memory_mask
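The `cache` argument above implements incremental decoding: each step feeds the full prefix, but only the last position is recomputed and then concatenated back onto the cached outputs, so `cache[i]` always has shape (batch, prefix_len - 1, size). A greedy-decoding sketch against `TransformerDecoder.forward_one_step` (toy sizes; it assumes `subsequent_mask` is exposed by `inspiremusic.utils.mask`, as used in `decoder.py` above):

```python
import torch
from inspiremusic.transformer.decoder import TransformerDecoder
from inspiremusic.utils.mask import subsequent_mask

decoder = TransformerDecoder(vocab_size=100, encoder_output_size=64,
                             attention_heads=4, linear_units=256, num_blocks=2)
decoder.eval()

memory = torch.randn(1, 20, 64)
memory_mask = torch.ones(1, 1, 20, dtype=torch.bool)

hyp = [1]          # running hypothesis, seeded with an <sos>-like id
cache = None
with torch.no_grad():
    for _ in range(5):
        tgt = torch.tensor(hyp).unsqueeze(0)               # (1, len)
        tgt_mask = subsequent_mask(len(hyp)).unsqueeze(0)   # (1, len, len) causal mask
        logp, cache = decoder.forward_one_step(memory, memory_mask,
                                               tgt, tgt_mask, cache)
        hyp.append(int(logp.argmax(dim=-1).item()))         # greedy next token
```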
inspiremusic/transformer/embedding.py ADDED
@@ -0,0 +1,294 @@
1
+ # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Positonal Encoding Module."""
17
+
18
+ import math
19
+ from typing import Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import numpy as np
24
+
25
+
26
+ class PositionalEncoding(torch.nn.Module):
27
+ """Positional encoding.
28
+
29
+ :param int d_model: embedding dim
30
+ :param float dropout_rate: dropout rate
31
+ :param int max_len: maximum input length
32
+
33
+ PE(pos, 2i) = sin(pos/(10000^(2i/dmodel)))
34
+ PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
35
+ """
36
+
37
+ def __init__(self,
38
+ d_model: int,
39
+ dropout_rate: float,
40
+ max_len: int = 5000,
41
+ reverse: bool = False):
42
+ """Construct an PositionalEncoding object."""
43
+ super().__init__()
44
+ self.d_model = d_model
45
+ self.xscale = math.sqrt(self.d_model)
46
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
47
+ self.max_len = max_len
48
+
49
+ self.pe = torch.zeros(self.max_len, self.d_model)
50
+ position = torch.arange(0, self.max_len,
51
+ dtype=torch.float32).unsqueeze(1)
52
+ div_term = torch.exp(
53
+ torch.arange(0, self.d_model, 2, dtype=torch.float32) *
54
+ -(math.log(10000.0) / self.d_model))
55
+ self.pe[:, 0::2] = torch.sin(position * div_term)
56
+ self.pe[:, 1::2] = torch.cos(position * div_term)
57
+ self.pe = self.pe.unsqueeze(0)
58
+
59
+ def forward(self,
60
+ x: torch.Tensor,
61
+ offset: Union[int, torch.Tensor] = 0) \
62
+ -> Tuple[torch.Tensor, torch.Tensor]:
63
+ """Add positional encoding.
64
+
65
+ Args:
66
+ x (torch.Tensor): Input. Its shape is (batch, time, ...)
67
+ offset (int, torch.tensor): position offset
68
+
69
+ Returns:
70
+ torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
71
+ torch.Tensor: for compatibility to RelPositionalEncoding
72
+ """
73
+
74
+ self.pe = self.pe.to(x.device)
75
+ pos_emb = self.position_encoding(offset, x.size(1), False)
76
+ x = x * self.xscale + pos_emb
77
+ return self.dropout(x), self.dropout(pos_emb)
78
+
79
+ def position_encoding(self,
80
+ offset: Union[int, torch.Tensor],
81
+ size: int,
82
+ apply_dropout: bool = True) -> torch.Tensor:
83
+ """ For getting encoding in a streaming fashion
84
+
85
+ Attention!!!!!
86
+ we apply dropout only once at the whole utterance level in a non-
87
+ streaming way, but will call this function several times with
88
+ increasing input size in a streaming scenario, so the dropout will
89
+ be applied several times.
90
+
91
+ Args:
92
+ offset (int or torch.tensor): start offset
93
+ size (int): required size of position encoding
94
+
95
+ Returns:
96
+ torch.Tensor: Corresponding encoding
97
+ """
98
+ # How to subscript a Union type:
99
+ # https://github.com/pytorch/pytorch/issues/69434
100
+ if isinstance(offset, int):
101
+ assert offset + size <= self.max_len
102
+ pos_emb = self.pe[:, offset:offset + size]
103
+ elif isinstance(offset, torch.Tensor) and offset.dim() == 0: # scalar
104
+ assert offset + size <= self.max_len
105
+ pos_emb = self.pe[:, offset:offset + size]
106
+ else: # for batched streaming decoding on GPU
107
+ assert torch.max(offset) + size <= self.max_len
108
+ index = offset.unsqueeze(1) + \
109
+ torch.arange(0, size).to(offset.device) # B X T
110
+ flag = index > 0
111
+ # remove negative offset
112
+ index = index * flag
113
+ pos_emb = F.embedding(index, self.pe[0]) # B X T X d_model
114
+
115
+ if apply_dropout:
116
+ pos_emb = self.dropout(pos_emb)
117
+ return pos_emb
118
+
119
+
120
+ class RelPositionalEncoding(PositionalEncoding):
121
+ """Relative positional encoding module.
122
+ See : Appendix B in https://arxiv.org/abs/1901.02860
123
+ Args:
124
+ d_model (int): Embedding dimension.
125
+ dropout_rate (float): Dropout rate.
126
+ max_len (int): Maximum input length.
127
+ """
128
+
129
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
130
+ """Initialize class."""
131
+ super().__init__(d_model, dropout_rate, max_len, reverse=True)
132
+
133
+ def forward(self,
134
+ x: torch.Tensor,
135
+ offset: Union[int, torch.Tensor] = 0) \
136
+ -> Tuple[torch.Tensor, torch.Tensor]:
137
+ """Compute positional encoding.
138
+ Args:
139
+ x (torch.Tensor): Input tensor (batch, time, `*`).
140
+ Returns:
141
+ torch.Tensor: Encoded tensor (batch, time, `*`).
142
+ torch.Tensor: Positional embedding tensor (1, time, `*`).
143
+ """
144
+ self.pe = self.pe.to(x.device)
145
+ x = x * self.xscale
146
+ pos_emb = self.position_encoding(offset, x.size(1), False)
147
+ return self.dropout(x), self.dropout(pos_emb)
148
+
149
+
150
+ class WhisperPositionalEncoding(PositionalEncoding):
151
+ """ Sinusoids position encoding used in openai-whisper.encoder
152
+ """
153
+
154
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 1500):
155
+ super().__init__(d_model, dropout_rate, max_len)
156
+ self.xscale = 1.0
157
+ log_timescale_increment = np.log(10000) / (d_model // 2 - 1)
158
+ inv_timescales = torch.exp(-log_timescale_increment *
159
+ torch.arange(d_model // 2))
160
+ scaled_time = torch.arange(max_len)[:, np.newaxis] * \
161
+ inv_timescales[np.newaxis, :]
162
+ pe = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
163
+ delattr(self, "pe")
164
+ self.register_buffer("pe", pe.unsqueeze(0))
165
+
166
+
167
+ class LearnablePositionalEncoding(PositionalEncoding):
168
+ """ Learnable position encoding used in openai-whisper.decoder
169
+ """
170
+
171
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 448):
172
+ super().__init__(d_model, dropout_rate, max_len)
173
+ # NOTE(xcsong): overwrite self.pe & self.xscale
174
+ self.pe = torch.nn.Parameter(torch.empty(1, max_len, d_model))
175
+ self.xscale = 1.0
176
+
177
+
178
+ class NoPositionalEncoding(torch.nn.Module):
179
+ """ No position encoding
180
+ """
181
+
182
+ def __init__(self, d_model: int, dropout_rate: float):
183
+ super().__init__()
184
+ self.d_model = d_model
185
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
186
+
187
+ def forward(self,
188
+ x: torch.Tensor,
189
+ offset: Union[int, torch.Tensor] = 0) \
190
+ -> Tuple[torch.Tensor, torch.Tensor]:
191
+ """ Just return zero vector for interface compatibility
192
+ """
193
+ pos_emb = torch.zeros(1, x.size(1), self.d_model).to(x.device)
194
+ return self.dropout(x), pos_emb
195
+
196
+ def position_encoding(self, offset: Union[int, torch.Tensor],
197
+ size: int) -> torch.Tensor:
198
+ return torch.zeros(1, size, self.d_model)
199
+
200
+
201
+ class EspnetRelPositionalEncoding(torch.nn.Module):
202
+ """Relative positional encoding module (new implementation).
203
+
204
+ Details can be found in https://github.com/espnet/espnet/pull/2816.
205
+
206
+ See : Appendix B in https://arxiv.org/abs/1901.02860
207
+
208
+ Args:
209
+ d_model (int): Embedding dimension.
210
+ dropout_rate (float): Dropout rate.
211
+ max_len (int): Maximum input length.
212
+
213
+ """
214
+
215
+ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000):
216
+ """Construct an PositionalEncoding object."""
217
+ super(EspnetRelPositionalEncoding, self).__init__()
218
+ self.d_model = d_model
219
+ self.xscale = math.sqrt(self.d_model)
220
+ self.dropout = torch.nn.Dropout(p=dropout_rate)
221
+ self.pe = None
222
+ self.extend_pe(torch.tensor(0.0).expand(1, max_len))
223
+
224
+ def extend_pe(self, x: torch.Tensor):
225
+ """Reset the positional encodings."""
226
+ if self.pe is not None:
227
+ # self.pe contains both positive and negative parts
228
+ # the length of self.pe is 2 * input_len - 1
229
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
230
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
231
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
232
+ return
233
+ # Suppose `i` denotes the position of the query vector and `j` denotes the
234
+ # position of the key vector. We use positive relative positions when keys
235
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
236
+ pe_positive = torch.zeros(x.size(1), self.d_model)
237
+ pe_negative = torch.zeros(x.size(1), self.d_model)
238
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
239
+ div_term = torch.exp(
240
+ torch.arange(0, self.d_model, 2, dtype=torch.float32)
241
+ * -(math.log(10000.0) / self.d_model)
242
+ )
243
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
244
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
245
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
246
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
247
+
248
+ # Reverse the order of positive indices and concat both positive and
249
+ # negative indices. This is used to support the shifting trick
250
+ # as in https://arxiv.org/abs/1901.02860
251
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
252
+ pe_negative = pe_negative[1:].unsqueeze(0)
253
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
254
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
255
+
256
+ def forward(self, x: torch.Tensor, offset: Union[int, torch.Tensor] = 0) \
257
+ -> Tuple[torch.Tensor, torch.Tensor]:
258
+ """Add positional encoding.
259
+
260
+ Args:
261
+ x (torch.Tensor): Input tensor (batch, time, `*`).
262
+
263
+ Returns:
264
+ torch.Tensor: Encoded tensor (batch, time, `*`).
265
+
266
+ """
267
+ self.extend_pe(x)
268
+ x = x * self.xscale
269
+ pos_emb = self.position_encoding(size=x.size(1), offset=offset)
270
+ return self.dropout(x), self.dropout(pos_emb)
271
+
272
+ def position_encoding(self,
273
+ offset: Union[int, torch.Tensor],
274
+ size: int) -> torch.Tensor:
275
+ """ For getting encoding in a streaming fashion
276
+
277
+ Attention!!!!!
278
+ we apply dropout only once at the whole utterance level in a non-
279
+ streaming way, but will call this function several times with
280
+ increasing input size in a streaming scenario, so the dropout will
281
+ be applied several times.
282
+
283
+ Args:
284
+ offset (int or torch.tensor): start offset
285
+ size (int): required size of position encoding
286
+
287
+ Returns:
288
+ torch.Tensor: Corresponding encoding
289
+ """
290
+ pos_emb = self.pe[
291
+ :,
292
+ self.pe.size(1) // 2 - size + 1: self.pe.size(1) // 2 + size,
293
+ ]
294
+ return pos_emb
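A quick sketch of the absolute `PositionalEncoding` contract above (toy sizes, dropout disabled): the input is scaled by sqrt(d_model), the sinusoid table is added, and `position_encoding` can later re-fetch the same rows by offset for streaming use.

```python
import torch
from inspiremusic.transformer.embedding import PositionalEncoding

pe = PositionalEncoding(d_model=64, dropout_rate=0.0, max_len=100)
x = torch.zeros(2, 10, 64)          # (batch, time, d_model)
y, pos_emb = pe(x)                  # y == x * sqrt(64) + pe.pe[:, :10]
print(y.shape, pos_emb.shape)       # (2, 10, 64) and (1, 10, 64)

# Streaming-style lookup: rows 4..9 of the same table, fetched by offset.
chunk = pe.position_encoding(offset=4, size=6, apply_dropout=False)
print(torch.allclose(chunk, pos_emb[:, 4:10]))   # True
```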
inspiremusic/transformer/encoder.py ADDED
@@ -0,0 +1,477 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2022 Xingchen Song ([email protected])
3
+ # 2024 Alibaba Inc (Xiang Lyu)
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ # Modified from ESPnet(https://github.com/espnet/espnet)
17
+ """Encoder definition."""
18
+ from typing import Tuple
19
+
20
+ import torch
21
+ import torch.utils.checkpoint as ckpt
22
+
23
+ from inspiremusic.transformer.convolution import ConvolutionModule
24
+ from inspiremusic.transformer.encoder_layer import TransformerEncoderLayer
25
+ from inspiremusic.transformer.encoder_layer import ConformerEncoderLayer
26
+ from inspiremusic.transformer.positionwise_feed_forward import PositionwiseFeedForward
27
+ from inspiremusic.utils.class_utils import (
28
+ INSPIREMUSIC_EMB_CLASSES,
29
+ INSPIREMUSIC_SUBSAMPLE_CLASSES,
30
+ INSPIREMUSIC_ATTENTION_CLASSES,
31
+ INSPIREMUSIC_ACTIVATION_CLASSES,
32
+ )
33
+ from inspiremusic.utils.mask import make_pad_mask
34
+ from inspiremusic.utils.mask import add_optional_chunk_mask
35
+
36
+
37
+ class BaseEncoder(torch.nn.Module):
38
+
39
+ def __init__(
40
+ self,
41
+ input_size: int,
42
+ output_size: int = 256,
43
+ attention_heads: int = 4,
44
+ linear_units: int = 2048,
45
+ num_blocks: int = 6,
46
+ dropout_rate: float = 0.1,
47
+ positional_dropout_rate: float = 0.1,
48
+ attention_dropout_rate: float = 0.0,
49
+ input_layer: str = "conv2d",
50
+ pos_enc_layer_type: str = "abs_pos",
51
+ normalize_before: bool = True,
52
+ static_chunk_size: int = 0,
53
+ use_dynamic_chunk: bool = False,
54
+ global_cmvn: torch.nn.Module = None,
55
+ use_dynamic_left_chunk: bool = False,
56
+ gradient_checkpointing: bool = False,
57
+ ):
58
+ """
59
+ Args:
60
+ input_size (int): input dim
61
+ output_size (int): dimension of attention
62
+ attention_heads (int): the number of heads of multi head attention
63
+ linear_units (int): the hidden units number of position-wise feed
64
+ forward
65
+ num_blocks (int): the number of decoder blocks
66
+ dropout_rate (float): dropout rate
67
+ attention_dropout_rate (float): dropout rate in attention
68
+ positional_dropout_rate (float): dropout rate after adding
69
+ positional encoding
70
+ input_layer (str): input layer type.
71
+ optional [linear, conv2d, conv2d6, conv2d8]
72
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
73
+ optional [abs_pos, scaled_abs_pos, rel_pos, no_pos]
74
+ normalize_before (bool):
75
+ True: use layer_norm before each sub-block of a layer.
76
+ False: use layer_norm after each sub-block of a layer.
77
+ static_chunk_size (int): chunk size for static chunk training and
78
+ decoding
79
+ use_dynamic_chunk (bool): whether to use a dynamic chunk size for
80
+ training or not. You can only use a fixed chunk (chunk_size > 0)
81
+ or a dynamic chunk size (use_dynamic_chunk = True)
82
+ global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module
83
+ use_dynamic_left_chunk (bool): whether to use dynamic left chunk in
84
+ dynamic chunk training
85
+ key_bias: whether to use bias in attention.linear_k; False for whisper models.
86
+ gradient_checkpointing: rerunning a forward-pass segment for each
87
+ checkpointed segment during backward.
88
+ """
89
+ super().__init__()
90
+ self._output_size = output_size
91
+
92
+ self.global_cmvn = global_cmvn
93
+ self.embed = INSPIREMUSIC_SUBSAMPLE_CLASSES[input_layer](
94
+ input_size,
95
+ output_size,
96
+ dropout_rate,
97
+ INSPIREMUSIC_EMB_CLASSES[pos_enc_layer_type](output_size,
98
+ positional_dropout_rate),
99
+ )
100
+
101
+ self.normalize_before = normalize_before
102
+ self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-5)
103
+ self.static_chunk_size = static_chunk_size
104
+ self.use_dynamic_chunk = use_dynamic_chunk
105
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
106
+ self.gradient_checkpointing = gradient_checkpointing
107
+
108
+ def output_size(self) -> int:
109
+ return self._output_size
110
+
111
+ def forward(
112
+ self,
113
+ xs: torch.Tensor,
114
+ xs_lens: torch.Tensor,
115
+ decoding_chunk_size: int = 0,
116
+ num_decoding_left_chunks: int = -1,
117
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
118
+ """Embed positions in tensor.
119
+
120
+ Args:
121
+ xs: padded input tensor (B, T, D)
122
+ xs_lens: input length (B)
123
+ decoding_chunk_size: decoding chunk size for dynamic chunk
124
+ 0: default for training, use random dynamic chunk.
125
+ <0: for decoding, use full chunk.
126
+ >0: for decoding, use fixed chunk size as set.
127
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
128
+ the chunk size is decoding_chunk_size.
129
+ >=0: use num_decoding_left_chunks
130
+ <0: use all left chunks
131
+ Returns:
132
+ encoder output tensor xs, and subsampled masks
133
+ xs: padded output tensor (B, T' ~= T/subsample_rate, D)
134
+ masks: torch.Tensor batch padding mask after subsample
135
+ (B, 1, T' ~= T/subsample_rate)
136
+ NOTE(xcsong):
137
+ We pass the `__call__` method of the modules instead of `forward` to the
138
+ checkpointing API because `__call__` attaches all the hooks of the module.
139
+ https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2
140
+ """
141
+ T = xs.size(1)
142
+ masks = ~make_pad_mask(xs_lens, T).unsqueeze(1) # (B, 1, T)
143
+ if self.global_cmvn is not None:
144
+ xs = self.global_cmvn(xs)
145
+ xs, pos_emb, masks = self.embed(xs, masks)
146
+ mask_pad = masks # (B, 1, T/subsample_rate)
147
+ chunk_masks = add_optional_chunk_mask(xs, masks,
148
+ self.use_dynamic_chunk,
149
+ self.use_dynamic_left_chunk,
150
+ decoding_chunk_size,
151
+ self.static_chunk_size,
152
+ num_decoding_left_chunks)
153
+
154
+ if self.gradient_checkpointing and self.training:
155
+ xs = self.forward_layers_checkpointed(xs, chunk_masks, pos_emb,
156
+ mask_pad)
157
+ else:
158
+ xs = self.forward_layers(xs, chunk_masks, pos_emb, mask_pad)
159
+ if self.normalize_before:
160
+ xs = self.after_norm(xs)
161
+ # Here we assume the mask is not changed in encoder layers, so just
162
+ # return the masks before encoder layers, and the masks will be used
163
+ # for cross attention with decoder later
164
+ return xs, masks
165
+
166
+ def forward_layers(self, xs: torch.Tensor, chunk_masks: torch.Tensor,
167
+ pos_emb: torch.Tensor,
168
+ mask_pad: torch.Tensor) -> torch.Tensor:
169
+ for layer in self.encoders:
170
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
171
+ return xs
172
+
173
+ @torch.jit.unused
174
+ def forward_layers_checkpointed(self, xs: torch.Tensor,
175
+ chunk_masks: torch.Tensor,
176
+ pos_emb: torch.Tensor,
177
+ mask_pad: torch.Tensor) -> torch.Tensor:
178
+ for layer in self.encoders:
179
+ xs, chunk_masks, _, _ = ckpt.checkpoint(layer.__call__, xs,
180
+ chunk_masks, pos_emb,
181
+ mask_pad)
182
+ return xs
183
+
184
+ @torch.jit.export
185
+ def forward_chunk(
186
+ self,
187
+ xs: torch.Tensor,
188
+ offset: int,
189
+ required_cache_size: int,
190
+ att_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
191
+ cnn_cache: torch.Tensor = torch.zeros(0, 0, 0, 0),
192
+ att_mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
193
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
194
+ """ Forward just one chunk
195
+
196
+ Args:
197
+ xs (torch.Tensor): chunk input, with shape (b=1, time, mel-dim),
198
+ where `time == (chunk_size - 1) * subsample_rate + \
199
+ subsample.right_context + 1`
200
+ offset (int): current offset in encoder output time stamp
201
+ required_cache_size (int): cache size required for next chunk
202
+ computation
203
+ >=0: actual cache size
204
+ <0: means all history cache is required
205
+ att_cache (torch.Tensor): cache tensor for KEY & VALUE in
206
+ transformer/conformer attention, with shape
207
+ (elayers, head, cache_t1, d_k * 2), where
208
+ `head * d_k == hidden-dim` and
209
+ `cache_t1 == chunk_size * num_decoding_left_chunks`.
210
+ cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer,
211
+ (elayers, b=1, hidden-dim, cache_t2), where
212
+ `cache_t2 == cnn.lorder - 1`
213
+
214
+ Returns:
215
+ torch.Tensor: output of current input xs,
216
+ with shape (b=1, chunk_size, hidden-dim).
217
+ torch.Tensor: new attention cache required for next chunk, with
218
+ dynamic shape (elayers, head, ?, d_k * 2)
219
+ depending on required_cache_size.
220
+ torch.Tensor: new conformer cnn cache required for next chunk, with
221
+ same shape as the original cnn_cache.
222
+
223
+ """
224
+ assert xs.size(0) == 1
225
+ # tmp_masks is just for interface compatibility
226
+ tmp_masks = torch.ones(1,
227
+ xs.size(1),
228
+ device=xs.device,
229
+ dtype=torch.bool)
230
+ tmp_masks = tmp_masks.unsqueeze(1)
231
+ if self.global_cmvn is not None:
232
+ xs = self.global_cmvn(xs)
233
+ # NOTE(xcsong): Before embed, shape(xs) is (b=1, time, mel-dim)
234
+
235
+ xs, pos_emb, _ = self.embed(xs, tmp_masks, offset)
236
+ # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim)
237
+ elayers, cache_t1 = att_cache.size(0), att_cache.size(2)
238
+ chunk_size = xs.size(1)
239
+ attention_key_size = cache_t1 + chunk_size
240
+ pos_emb = self.embed.position_encoding(offset=offset - cache_t1,
241
+ size=attention_key_size)
242
+ if required_cache_size < 0:
243
+ next_cache_start = 0
244
+ elif required_cache_size == 0:
245
+ next_cache_start = attention_key_size
246
+ else:
247
+ next_cache_start = max(attention_key_size - required_cache_size, 0)
248
+ r_att_cache = []
249
+ r_cnn_cache = []
250
+ for i, layer in enumerate(self.encoders):
251
+ # NOTE(xcsong): Before layer.forward
252
+ # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
253
+ # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
254
+
255
+ xs, _, new_att_cache, new_cnn_cache = layer(
256
+ xs,
257
+ att_mask,
258
+ pos_emb,
259
+ att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
260
+ cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache)
261
+ # NOTE(xcsong): After layer.forward
262
+ # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
263
+ # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
264
+ r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
265
+ r_cnn_cache.append(new_cnn_cache.unsqueeze(0))
266
+ if self.normalize_before:
267
+ xs = self.after_norm(xs)
268
+
269
+ # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
270
+ # ? may be larger than cache_t1, it depends on required_cache_size
271
+ r_att_cache = torch.cat(r_att_cache, dim=0)
272
+ # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
273
+ r_cnn_cache = torch.cat(r_cnn_cache, dim=0)
274
+
275
+ return (xs, r_att_cache, r_cnn_cache)
276
+
277
+ @torch.jit.unused
278
+ def forward_chunk_by_chunk(
279
+ self,
280
+ xs: torch.Tensor,
281
+ decoding_chunk_size: int,
282
+ num_decoding_left_chunks: int = -1,
283
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
284
+ """ Forward input chunk by chunk with chunk_size like a streaming
285
+ fashion
286
+
287
+ Here we should pay special attention to computation cache in the
288
+ streaming style forward chunk by chunk. Three things should be taken
289
+ into account for computation in the current network:
290
+ 1. transformer/conformer encoder layers output cache
291
+ 2. convolution in conformer
292
+ 3. convolution in subsampling
293
+
294
+ However, we don't implement subsampling cache for:
295
+ 1. We can control subsampling module to output the right result by
296
+ overlapping input instead of cache left context, even though it
297
+ wastes some computation, but subsampling only takes a very
298
+ small fraction of computation in the whole model.
299
+ 2. Typically, there are several convolution layers with subsampling
300
+ in subsampling module, it is tricky and complicated to do cache
301
+ with different convolution layers with different subsampling
302
+ rate.
303
+ 3. Currently, nn.Sequential is used to stack all the convolution
304
+ layers in subsampling, we need to rewrite it to make it work
305
+ with cache, which is not preferred.
306
+ Args:
307
+ xs (torch.Tensor): (1, max_len, dim)
308
+ decoding_chunk_size (int): decoding chunk size
309
+ """
310
+ assert decoding_chunk_size > 0
311
+ # The model is trained by static or dynamic chunk
312
+ assert self.static_chunk_size > 0 or self.use_dynamic_chunk
313
+ subsampling = self.embed.subsampling_rate
314
+ context = self.embed.right_context + 1 # Add current frame
315
+ stride = subsampling * decoding_chunk_size
316
+ decoding_window = (decoding_chunk_size - 1) * subsampling + context
317
+ num_frames = xs.size(1)
318
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
319
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0), device=xs.device)
320
+ outputs = []
321
+ offset = 0
322
+ required_cache_size = decoding_chunk_size * num_decoding_left_chunks
323
+
324
+ # Feed forward overlap input step by step
325
+ for cur in range(0, num_frames - context + 1, stride):
326
+ end = min(cur + decoding_window, num_frames)
327
+ chunk_xs = xs[:, cur:end, :]
328
+ (y, att_cache,
329
+ cnn_cache) = self.forward_chunk(chunk_xs, offset,
330
+ required_cache_size, att_cache,
331
+ cnn_cache)
332
+ outputs.append(y)
333
+ offset += y.size(1)
334
+ ys = torch.cat(outputs, 1)
335
+ masks = torch.ones((1, 1, ys.size(1)),
336
+ device=ys.device,
337
+ dtype=torch.bool)
338
+ return ys, masks
339
+
340
+
341
+ class TransformerEncoder(BaseEncoder):
342
+ """Transformer encoder module."""
343
+
344
+ def __init__(
345
+ self,
346
+ input_size: int,
347
+ output_size: int = 256,
348
+ attention_heads: int = 4,
349
+ linear_units: int = 2048,
350
+ num_blocks: int = 6,
351
+ dropout_rate: float = 0.1,
352
+ positional_dropout_rate: float = 0.1,
353
+ attention_dropout_rate: float = 0.0,
354
+ input_layer: str = "conv2d",
355
+ pos_enc_layer_type: str = "abs_pos",
356
+ normalize_before: bool = True,
357
+ static_chunk_size: int = 0,
358
+ use_dynamic_chunk: bool = False,
359
+ global_cmvn: torch.nn.Module = None,
360
+ use_dynamic_left_chunk: bool = False,
361
+ key_bias: bool = True,
362
+ selfattention_layer_type: str = "selfattn",
363
+ activation_type: str = "relu",
364
+ gradient_checkpointing: bool = False,
365
+ ):
366
+ """ Construct TransformerEncoder
367
+
368
+ See Encoder for the meaning of each parameter.
369
+ """
370
+ super().__init__(input_size, output_size, attention_heads,
371
+ linear_units, num_blocks, dropout_rate,
372
+ positional_dropout_rate, attention_dropout_rate,
373
+ input_layer, pos_enc_layer_type, normalize_before,
374
+ static_chunk_size, use_dynamic_chunk, global_cmvn,
375
+ use_dynamic_left_chunk, gradient_checkpointing)
376
+ activation = INSPIREMUSIC_ACTIVATION_CLASSES[activation_type]()
377
+ self.encoders = torch.nn.ModuleList([
378
+ TransformerEncoderLayer(
379
+ output_size,
380
+ INSPIREMUSIC_ATTENTION_CLASSES[selfattention_layer_type](attention_heads,
381
+ output_size,
382
+ attention_dropout_rate,
383
+ key_bias),
384
+ PositionwiseFeedForward(output_size, linear_units,
385
+ dropout_rate, activation),
386
+ dropout_rate, normalize_before) for _ in range(num_blocks)
387
+ ])
388
+
389
+
390
+ class ConformerEncoder(BaseEncoder):
391
+ """Conformer encoder module."""
392
+
393
+ def __init__(
394
+ self,
395
+ input_size: int,
396
+ output_size: int = 256,
397
+ attention_heads: int = 4,
398
+ linear_units: int = 2048,
399
+ num_blocks: int = 6,
400
+ dropout_rate: float = 0.1,
401
+ positional_dropout_rate: float = 0.1,
402
+ attention_dropout_rate: float = 0.0,
403
+ input_layer: str = "conv2d",
404
+ pos_enc_layer_type: str = "rel_pos",
405
+ normalize_before: bool = True,
406
+ static_chunk_size: int = 0,
407
+ use_dynamic_chunk: bool = False,
408
+ global_cmvn: torch.nn.Module = None,
409
+ use_dynamic_left_chunk: bool = False,
410
+ positionwise_conv_kernel_size: int = 1,
411
+ macaron_style: bool = True,
412
+ selfattention_layer_type: str = "rel_selfattn",
413
+ activation_type: str = "swish",
414
+ use_cnn_module: bool = True,
415
+ cnn_module_kernel: int = 15,
416
+ causal: bool = False,
417
+ cnn_module_norm: str = "batch_norm",
418
+ key_bias: bool = True,
419
+ gradient_checkpointing: bool = False,
420
+ ):
421
+ """Construct ConformerEncoder
422
+
423
+ Args:
424
+ input_size to use_dynamic_chunk, see in BaseEncoder
425
+ positionwise_conv_kernel_size (int): Kernel size of positionwise
426
+ conv1d layer.
427
+ macaron_style (bool): Whether to use macaron style for
428
+ positionwise layer.
429
+ selfattention_layer_type (str): Encoder attention layer type,
430
+ the parameter has no effect now; it's just kept for configuration
431
+ compatibility.
432
+ activation_type (str): Encoder activation function type.
433
+ use_cnn_module (bool): Whether to use convolution module.
434
+ cnn_module_kernel (int): Kernel size of convolution module.
435
+ causal (bool): whether to use causal convolution or not.
436
+ key_bias: whether to use bias in attention.linear_k; False for whisper models.
437
+ """
438
+ super().__init__(input_size, output_size, attention_heads,
439
+ linear_units, num_blocks, dropout_rate,
440
+ positional_dropout_rate, attention_dropout_rate,
441
+ input_layer, pos_enc_layer_type, normalize_before,
442
+ static_chunk_size, use_dynamic_chunk, global_cmvn,
443
+ use_dynamic_left_chunk, gradient_checkpointing)
444
+ activation = INSPIREMUSIC_ACTIVATION_CLASSES[activation_type]()
445
+
446
+ # self-attention module definition
447
+ encoder_selfattn_layer_args = (
448
+ attention_heads,
449
+ output_size,
450
+ attention_dropout_rate,
451
+ key_bias,
452
+ )
453
+ # feed-forward module definition
454
+ positionwise_layer_args = (
455
+ output_size,
456
+ linear_units,
457
+ dropout_rate,
458
+ activation,
459
+ )
460
+ # convolution module definition
461
+ convolution_layer_args = (output_size, cnn_module_kernel, activation,
462
+ cnn_module_norm, causal)
463
+
464
+ self.encoders = torch.nn.ModuleList([
465
+ ConformerEncoderLayer(
466
+ output_size,
467
+ INSPIREMUSIC_ATTENTION_CLASSES[selfattention_layer_type](
468
+ *encoder_selfattn_layer_args),
469
+ PositionwiseFeedForward(*positionwise_layer_args),
470
+ PositionwiseFeedForward(
471
+ *positionwise_layer_args) if macaron_style else None,
472
+ ConvolutionModule(
473
+ *convolution_layer_args) if use_cnn_module else None,
474
+ dropout_rate,
475
+ normalize_before,
476
+ ) for _ in range(num_blocks)
477
+ ])
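Like the decoder, the encoders above take padded feature batches plus lengths and return subsampled outputs with a padding mask. A toy, non-streaming sketch of `ConformerEncoder` (illustrative sizes only; with the default `conv2d` subsampling the output length is roughly T/4):

```python
import torch
from inspiremusic.transformer.encoder import ConformerEncoder

encoder = ConformerEncoder(input_size=80, output_size=256, attention_heads=4,
                           linear_units=512, num_blocks=2)

feats = torch.randn(2, 120, 80)        # e.g. 80-dim mel-spectrogram frames
feat_lens = torch.tensor([120, 96])    # true lengths before padding
xs, masks = encoder(feats, feat_lens)
print(xs.shape, masks.shape)           # (2, T', 256) and (2, 1, T'), T' ~ 120 / 4
```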
inspiremusic/transformer/encoder_layer.py ADDED
@@ -0,0 +1,235 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2022 Xingchen Song ([email protected])
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Encoder self-attention layer definition."""
17
+
18
+ from typing import Optional, Tuple
19
+
20
+ import torch
21
+ from torch import nn
22
+
23
+
24
+ class TransformerEncoderLayer(nn.Module):
25
+ """Encoder layer module.
26
+
27
+ Args:
28
+ size (int): Input dimension.
29
+ self_attn (torch.nn.Module): Self-attention module instance.
30
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
31
+ instance can be used as the argument.
32
+ feed_forward (torch.nn.Module): Feed-forward module instance.
33
+ `PositionwiseFeedForward`, instance can be used as the argument.
34
+ dropout_rate (float): Dropout rate.
35
+ normalize_before (bool):
36
+ True: use layer_norm before each sub-block.
37
+ False: use layer_norm after each sub-block.
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ size: int,
43
+ self_attn: torch.nn.Module,
44
+ feed_forward: torch.nn.Module,
45
+ dropout_rate: float,
46
+ normalize_before: bool = True,
47
+ ):
48
+ """Construct an EncoderLayer object."""
49
+ super().__init__()
50
+ self.self_attn = self_attn
51
+ self.feed_forward = feed_forward
52
+ self.norm1 = nn.LayerNorm(size, eps=1e-5)
53
+ self.norm2 = nn.LayerNorm(size, eps=1e-5)
54
+ self.dropout = nn.Dropout(dropout_rate)
55
+ self.size = size
56
+ self.normalize_before = normalize_before
57
+
58
+ def forward(
59
+ self,
60
+ x: torch.Tensor,
61
+ mask: torch.Tensor,
62
+ pos_emb: torch.Tensor,
63
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
64
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
65
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
66
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
67
+ """Compute encoded features.
68
+
69
+ Args:
70
+ x (torch.Tensor): (#batch, time, size)
71
+ mask (torch.Tensor): Mask tensor for the input (#batch, time,time),
72
+ (0, 0, 0) means fake mask.
73
+ pos_emb (torch.Tensor): just for interface compatibility
74
+ to ConformerEncoderLayer
75
+ mask_pad (torch.Tensor): not used in the transformer layer,
76
+ just for unified api with conformer.
77
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
78
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
79
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
80
+ (#batch=1, size, cache_t2), not used here, it's for interface
81
+ compatibility to ConformerEncoderLayer.
82
+ Returns:
83
+ torch.Tensor: Output tensor (#batch, time, size).
84
+ torch.Tensor: Mask tensor (#batch, time, time).
85
+ torch.Tensor: att_cache tensor,
86
+ (#batch=1, head, cache_t1 + time, d_k * 2).
87
+ torch.Tensor: cnn_cache tensor (#batch=1, size, cache_t2).
88
+
89
+ """
90
+ residual = x
91
+ if self.normalize_before:
92
+ x = self.norm1(x)
93
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb=pos_emb, cache=att_cache)
94
+ x = residual + self.dropout(x_att)
95
+ if not self.normalize_before:
96
+ x = self.norm1(x)
97
+
98
+ residual = x
99
+ if self.normalize_before:
100
+ x = self.norm2(x)
101
+ x = residual + self.dropout(self.feed_forward(x))
102
+ if not self.normalize_before:
103
+ x = self.norm2(x)
104
+
105
+ fake_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
106
+ return x, mask, new_att_cache, fake_cnn_cache
107
+
108
+
109
+ class ConformerEncoderLayer(nn.Module):
110
+ """Encoder layer module.
111
+ Args:
112
+ size (int): Input dimension.
113
+ self_attn (torch.nn.Module): Self-attention module instance.
114
+ `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
115
+ instance can be used as the argument.
116
+ feed_forward (torch.nn.Module): Feed-forward module instance.
117
+ `PositionwiseFeedForward` instance can be used as the argument.
118
+ feed_forward_macaron (torch.nn.Module): Additional feed-forward module
119
+ instance.
120
+ `PositionwiseFeedForward` instance can be used as the argument.
121
+ conv_module (torch.nn.Module): Convolution module instance.
122
+ `ConvolutionModule` instance can be used as the argument.
123
+ dropout_rate (float): Dropout rate.
124
+ normalize_before (bool):
125
+ True: use layer_norm before each sub-block.
126
+ False: use layer_norm after each sub-block.
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ size: int,
132
+ self_attn: torch.nn.Module,
133
+ feed_forward: Optional[nn.Module] = None,
134
+ feed_forward_macaron: Optional[nn.Module] = None,
135
+ conv_module: Optional[nn.Module] = None,
136
+ dropout_rate: float = 0.1,
137
+ normalize_before: bool = True,
138
+ ):
139
+ """Construct an EncoderLayer object."""
140
+ super().__init__()
141
+ self.self_attn = self_attn
142
+ self.feed_forward = feed_forward
143
+ self.feed_forward_macaron = feed_forward_macaron
144
+ self.conv_module = conv_module
145
+ self.norm_ff = nn.LayerNorm(size, eps=1e-5) # for the FNN module
146
+ self.norm_mha = nn.LayerNorm(size, eps=1e-5) # for the MHA module
147
+ if feed_forward_macaron is not None:
148
+ self.norm_ff_macaron = nn.LayerNorm(size, eps=1e-5)
149
+ self.ff_scale = 0.5
150
+ else:
151
+ self.ff_scale = 1.0
152
+ if self.conv_module is not None:
153
+ self.norm_conv = nn.LayerNorm(size, eps=1e-5) # for the CNN module
154
+ self.norm_final = nn.LayerNorm(
155
+ size, eps=1e-5) # for the final output of the block
156
+ self.dropout = nn.Dropout(dropout_rate)
157
+ self.size = size
158
+ self.normalize_before = normalize_before
159
+
160
+ def forward(
161
+ self,
162
+ x: torch.Tensor,
163
+ mask: torch.Tensor,
164
+ pos_emb: torch.Tensor,
165
+ mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
166
+ att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
167
+ cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
168
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
169
+ """Compute encoded features.
170
+
171
+ Args:
172
+ x (torch.Tensor): (#batch, time, size)
173
+ mask (torch.Tensor): Mask tensor for the input (#batch, time,time),
174
+ (0, 0, 0) means fake mask.
175
+ pos_emb (torch.Tensor): positional encoding, must not be None
176
+ for ConformerEncoderLayer.
177
+ mask_pad (torch.Tensor): batch padding mask used for conv module.
178
+ (#batch, 1,time), (0, 0, 0) means fake mask.
179
+ att_cache (torch.Tensor): Cache tensor of the KEY & VALUE
180
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
181
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer
182
+ (#batch=1, size, cache_t2)
183
+ Returns:
184
+ torch.Tensor: Output tensor (#batch, time, size).
185
+ torch.Tensor: Mask tensor (#batch, time, time).
186
+ torch.Tensor: att_cache tensor,
187
+ (#batch=1, head, cache_t1 + time, d_k * 2).
188
+ torch.Tensor: cnn_cache tensor (#batch, size, cache_t2).
189
+ """
190
+
191
+ # whether to use macaron style
192
+ if self.feed_forward_macaron is not None:
193
+ residual = x
194
+ if self.normalize_before:
195
+ x = self.norm_ff_macaron(x)
196
+ x = residual + self.ff_scale * self.dropout(
197
+ self.feed_forward_macaron(x))
198
+ if not self.normalize_before:
199
+ x = self.norm_ff_macaron(x)
200
+ # multi-headed self-attention module
201
+ residual = x
202
+ if self.normalize_before:
203
+ x = self.norm_mha(x)
204
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb,
205
+ att_cache)
206
+ x = residual + self.dropout(x_att)
207
+ if not self.normalize_before:
208
+ x = self.norm_mha(x)
209
+
210
+ # convolution module
211
+ # Fake new cnn cache here, and then change it in conv_module
212
+ new_cnn_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)
213
+ if self.conv_module is not None:
214
+ residual = x
215
+ if self.normalize_before:
216
+ x = self.norm_conv(x)
217
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
218
+ x = residual + self.dropout(x)
219
+
220
+ if not self.normalize_before:
221
+ x = self.norm_conv(x)
222
+
223
+ # feed forward module
224
+ residual = x
225
+ if self.normalize_before:
226
+ x = self.norm_ff(x)
227
+
228
+ x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
229
+ if not self.normalize_before:
230
+ x = self.norm_ff(x)
231
+
232
+ if self.conv_module is not None:
233
+ x = self.norm_final(x)
234
+
235
+ return x, mask, new_att_cache, new_cnn_cache
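A standalone sketch of the pre-norm residual pattern implemented by `TransformerEncoderLayer` above (toy sizes; it assumes `MultiHeadedAttention` lives in `inspiremusic.transformer.attention`, as the docstring suggests): with `normalize_before=True` each sub-block runs LayerNorm, then the sublayer, then dropout and the residual add.

```python
import torch
from inspiremusic.transformer.attention import MultiHeadedAttention
from inspiremusic.transformer.encoder_layer import TransformerEncoderLayer
from inspiremusic.transformer.positionwise_feed_forward import PositionwiseFeedForward

size, heads = 64, 4
layer = TransformerEncoderLayer(
    size,
    MultiHeadedAttention(heads, size, 0.0),       # self-attention sub-block
    PositionwiseFeedForward(size, 256, 0.0),      # feed-forward sub-block
    dropout_rate=0.0,
    normalize_before=True,
)

x = torch.randn(1, 10, size)
mask = torch.ones(1, 10, 10, dtype=torch.bool)
pos_emb = torch.zeros(1, 10, size)    # ignored by the plain transformer layer
y, out_mask, att_cache, cnn_cache = layer(x, mask, pos_emb)
print(y.shape)                        # (1, 10, 64)
```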
inspiremusic/transformer/label_smoothing_loss.py ADDED
@@ -0,0 +1,97 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Label smoothing module."""
16
+
17
+ import torch
18
+ from torch import nn
19
+
20
+
21
+ class LabelSmoothingLoss(nn.Module):
22
+ """Label-smoothing loss.
23
+
24
+ In a standard CE loss, the label's data distribution is:
25
+ [0,1,2] ->
26
+ [
27
+ [1.0, 0.0, 0.0],
28
+ [0.0, 1.0, 0.0],
29
+ [0.0, 0.0, 1.0],
30
+ ]
31
+
32
+ In the smoothed version of the CE loss, some probability mass
33
+ are taken from the true label prob (1.0) and are divided
34
+ among other labels.
35
+
36
+ e.g.
37
+ smoothing=0.1
38
+ [0,1,2] ->
39
+ [
40
+ [0.9, 0.05, 0.05],
41
+ [0.05, 0.9, 0.05],
42
+ [0.05, 0.05, 0.9],
43
+ ]
44
+
45
+ Args:
46
+ size (int): the number of class
47
+ padding_idx (int): padding class id which will be ignored for loss
48
+ smoothing (float): smoothing rate (0.0 means the conventional CE)
49
+ normalize_length (bool):
50
+ normalize loss by sequence length if True
51
+ normalize loss by batch size if False
52
+ """
53
+
54
+ def __init__(self,
55
+ size: int,
56
+ padding_idx: int,
57
+ smoothing: float,
58
+ normalize_length: bool = False):
59
+ """Construct an LabelSmoothingLoss object."""
60
+ super(LabelSmoothingLoss, self).__init__()
61
+ self.criterion = nn.KLDivLoss(reduction="none")
62
+ self.padding_idx = padding_idx
63
+ self.confidence = 1.0 - smoothing
64
+ self.smoothing = smoothing
65
+ self.size = size
66
+ self.normalize_length = normalize_length
67
+
68
+ def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
69
+ """Compute loss between x and target.
70
+
71
+ The model outputs and data labels tensors are flatten to
72
+ (batch*seqlen, class) shape and a mask is applied to the
73
+ padding part which should not be calculated for loss.
74
+
75
+ Args:
76
+ x (torch.Tensor): prediction (batch, seqlen, class)
77
+ target (torch.Tensor):
78
+ target signal masked with self.padding_id (batch, seqlen)
79
+ Returns:
80
+ loss (torch.Tensor) : The KL loss, scalar float value
81
+ """
82
+ assert x.size(2) == self.size
83
+ batch_size = x.size(0)
84
+ x = x.view(-1, self.size)
85
+ target = target.view(-1)
86
+ # use zeros_like instead of torch.no_grad() for true_dist,
87
+ # since no_grad() can not be exported by JIT
88
+ true_dist = torch.zeros_like(x)
89
+ true_dist.fill_(self.smoothing / (self.size - 1))
90
+ ignore = target == self.padding_idx # (B,)
91
+
92
+ total = len(target) - ignore.sum().item()
93
+ target = target.masked_fill(ignore, 0) # avoid -1 index
94
+ true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
95
+ kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
96
+ denom = total if self.normalize_length else batch_size
97
+ return kl.masked_fill(ignore.unsqueeze(1), 0).sum() / denom
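
A quick, hedged sanity check of the loss above; the class count and padding id are illustrative only.

import torch

criterion = LabelSmoothingLoss(size=4, padding_idx=-1, smoothing=0.1,
                               normalize_length=True)
logits = torch.randn(2, 5, 4)            # (batch, seqlen, class)
target = torch.randint(0, 4, (2, 5))     # (batch, seqlen)
target[:, -1] = -1                       # padded positions are ignored in the loss
loss = criterion(logits, target)         # scalar, normalized by the 8 non-pad tokens
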
inspiremusic/transformer/positionwise_feed_forward.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) 2019 Shigeki Karita
2
+ # 2020 Mobvoi Inc (Binbin Zhang)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Positionwise feed forward layer definition."""
16
+
17
+ import torch
18
+
19
+
20
+ class PositionwiseFeedForward(torch.nn.Module):
21
+ """Positionwise feed forward layer.
22
+
23
+ FeedForward is applied at each position of the sequence.
24
+ The output dim is the same as the input dim.
25
+
26
+ Args:
27
+ idim (int): Input dimension.
28
+ hidden_units (int): The number of hidden units.
29
+ dropout_rate (float): Dropout rate.
30
+ activation (torch.nn.Module): Activation function
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ idim: int,
36
+ hidden_units: int,
37
+ dropout_rate: float,
38
+ activation: torch.nn.Module = torch.nn.ReLU(),
39
+ ):
40
+ """Construct a PositionwiseFeedForward object."""
41
+ super(PositionwiseFeedForward, self).__init__()
42
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
43
+ self.activation = activation
44
+ self.dropout = torch.nn.Dropout(dropout_rate)
45
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
46
+
47
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
48
+ """Forward function.
49
+
50
+ Args:
51
+ xs: input tensor (B, L, D)
52
+ Returns:
53
+ output tensor, (B, L, D)
54
+ """
55
+ return self.w_2(self.dropout(self.activation(self.w_1(xs))))
56
+
57
+
58
+ class MoEFFNLayer(torch.nn.Module):
59
+ """
60
+ Mixture of expert with Positionwise feed forward layer
61
+ See also figure 1 in https://arxiv.org/pdf/2305.15663.pdf
62
+ The output dim is the same as the input dim.
63
+
64
+ Modified from https://github.com/Lightning-AI/lit-gpt/pull/823
65
+ https://github.com/mistralai/mistral-src/blob/b46d6/moe_one_file_ref.py#L203-L219
66
+ Args:
67
+ n_expert: number of expert.
68
+ n_expert_per_token: The actual number of experts used for each frame
69
+ idim (int): Input dimension.
70
+ hidden_units (int): The number of hidden units.
71
+ dropout_rate (float): Dropout rate.
72
+ activation (torch.nn.Module): Activation function
73
+ """
74
+
75
+ def __init__(
76
+ self,
77
+ n_expert: int,
78
+ n_expert_per_token: int,
79
+ idim: int,
80
+ hidden_units: int,
81
+ dropout_rate: float,
82
+ activation: torch.nn.Module = torch.nn.ReLU(),
83
+ ):
84
+ super(MoEFFNLayer, self).__init__()
85
+ self.gate = torch.nn.Linear(idim, n_expert, bias=False)
86
+ self.experts = torch.nn.ModuleList(
87
+ PositionwiseFeedForward(idim, hidden_units, dropout_rate,
88
+ activation) for _ in range(n_expert))
89
+ self.n_expert_per_token = n_expert_per_token
90
+
91
+ def forward(self, xs: torch.Tensor) -> torch.Tensor:
92
+ """Foward function.
93
+ Args:
94
+ xs: input tensor (B, L, D)
95
+ Returns:
96
+ output tensor, (B, L, D)
97
+
98
+ """
99
+ B, L, D = xs.size(
100
+ ) # batch size, sequence length, embedding dimension (idim)
101
+ xs = xs.view(-1, D) # (B*L, D)
102
+ router = self.gate(xs) # (B*L, n_expert)
103
+ logits, indices = torch.topk(
104
+ router, self.n_expert_per_token
105
+ ) # logits, indices: (B*L, n_expert_per_token)
106
+ weights = torch.nn.functional.softmax(
107
+ logits, dim=1,
108
+ dtype=torch.float).to(dtype=xs.dtype) # (B*L, n_expert_per_token)
109
+ output = torch.zeros_like(xs) # (B*L, D)
110
+ for i, expert in enumerate(self.experts):
111
+ mask = indices == i
112
+ batch_idx, ith_expert = torch.where(mask)
113
+ output[batch_idx] += weights[batch_idx, ith_expert, None] * expert(
114
+ xs[batch_idx])
115
+ return output.view(B, L, D)
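
A shape-level, hedged example of the two layers above; all dimensions are illustrative.

import torch

ffn = PositionwiseFeedForward(idim=256, hidden_units=1024, dropout_rate=0.1)
moe = MoEFFNLayer(n_expert=4, n_expert_per_token=2,
                  idim=256, hidden_units=1024, dropout_rate=0.1)
xs = torch.randn(2, 10, 256)        # (B, L, D)
assert ffn(xs).shape == xs.shape    # applied per position, output dim == input dim
assert moe(xs).shape == xs.shape    # top-2 routing over 4 experts, same shape out
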
inspiremusic/transformer/qwen_encoder.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch.nn as nn
16
+ import torch
17
+ from transformers import AutoModelForCausalLM, AutoTokenizer
18
+ from inspiremusic.utils.mask import make_pad_mask
19
+ from inspiremusic.utils.hinter import hint_once
20
+
21
+ class QwenEncoder(nn.Module):
22
+ def __init__(
23
+ self,
24
+ input_size: int,
25
+ pretrain_path: str = "Qwen/Qwen2.0-0.5B",
26
+ trainable: bool = False,
27
+ do_fusion_emb: bool = False,
28
+ fusion_drop_rate: float = 0.0,
29
+ ):
30
+ super(QwenEncoder, self).__init__()
31
+ self.input_size = input_size
32
+ self.trainable = trainable
33
+ self.model = AutoModelForCausalLM.from_pretrained(pretrain_path, device_map="cpu")
34
+ self._output_size = self.model.config.hidden_size
35
+ self.do_fusion_emb = do_fusion_emb
36
+ self.hidden_norm = torch.nn.LayerNorm(self._output_size)
37
+ self.fusion_dropout = nn.Dropout(fusion_drop_rate)
38
+ if do_fusion_emb:
39
+ self.fusion_layer = torch.nn.Linear(self._output_size * 2, self._output_size)
40
+ self.emb_norm = torch.nn.LayerNorm(self._output_size)
41
+ self.fusion_norm = torch.nn.LayerNorm(self._output_size)
42
+ from inspiremusic.transformer.activation import Swish
43
+ self.fusion_act = Swish(self)
44
+
45
+ if not self.trainable:
46
+ self.model.eval()
47
+
48
+ def output_size(self) -> int:
49
+ return self._output_size
50
+
51
+ def forward(
52
+ self,
53
+ input_ids: torch.Tensor,
54
+ ilens: torch.Tensor,
55
+ ):
56
+ device = input_ids.device
57
+ input_ids = torch.clamp(input_ids, min=0, max=None)
58
+ input_masks = (~make_pad_mask(ilens)).to(device).long()
59
+ if not self.trainable:
60
+ with torch.no_grad():
61
+ model_outputs = self.model(
62
+ input_ids=input_ids,
63
+ attention_mask=input_masks,
64
+ output_hidden_states=True
65
+ )
66
+ else:
67
+ model_outputs = self.model(
68
+ input_ids=input_ids,
69
+ attention_mask=input_masks,
70
+ output_hidden_states=True
71
+ )
72
+ outs = model_outputs.hidden_states[-1]
73
+ outs = self.hidden_norm(outs)
74
+ if self.do_fusion_emb:
75
+ hint_once("fuse embedding and LM outputs", "fuse_emb")
76
+ outs = self.fusion_dropout(self.fusion_act(outs))
77
+ emb = model_outputs.hidden_states[0]
78
+ emb = self.fusion_dropout(self.fusion_act(self.emb_norm(emb)))
79
+ outs = self.fusion_layer(
80
+ torch.cat([outs, emb], dim=-1)
81
+ )
82
+ outs = self.fusion_act(self.fusion_norm(outs))
83
+
84
+ return outs, ilens
85
+
86
+
87
+ class QwenEmbeddingEncoder(nn.Module):
88
+ def __init__(
89
+ self,
90
+ input_size: int,
91
+ pretrain_path: str = "Qwen/Qwen2.0-0.5B",
92
+ ):
93
+ super(QwenEmbeddingEncoder, self).__init__()
94
+ self.input_size = input_size
95
+ from transformers import Qwen2ForCausalLM
96
+ self.model = Qwen2ForCausalLM.from_pretrained(pretrain_path, device_map="cpu", attn_implementation="flash_attention_2")
97
+ self._output_size = self.model.config.hidden_size
98
+
99
+ def output_size(self) -> int:
100
+ return self._output_size
101
+
102
+ def forward(
103
+ self,
104
+ input_embeds: torch.Tensor,
105
+ ilens: torch.Tensor,
106
+ ):
107
+ input_masks = (~make_pad_mask(ilens)).to(input_embeds.device).long()
108
+
109
+ outs = self.model(
110
+ inputs_embeds=input_embeds,
111
+ attention_mask=input_masks,
112
+ output_hidden_states=True,
113
+ return_dict=True,
114
+ )
115
+
116
+ return outs.hidden_states[-1], input_masks
117
+
118
+ def forward_one_step(self, xs, masks, cache=None):
119
+
120
+ outs = self.model(
121
+ inputs_embeds=xs,
122
+ attention_mask=masks,
123
+ output_hidden_states=True,
124
+ return_dict=True,
125
+ use_cache=True,
126
+ past_key_values=cache,
127
+ )
128
+ xs = outs.hidden_states[-1]
129
+ new_cache = outs.past_key_values
130
+
131
+ return xs, masks, new_cache
132
+
133
+
134
+ class QwenInputOnlyEncoder(nn.Module):
135
+ def __init__(
136
+ self,
137
+ input_size: int,
138
+ pretrain_path: str = "Qwen/Qwen2.0-0.5B",
139
+ ):
140
+ super(QwenInputOnlyEncoder, self).__init__()
141
+ self.input_size = input_size
142
+ from transformers import Qwen2ForCausalLM
143
+ model = Qwen2ForCausalLM.from_pretrained(pretrain_path, device_map="cpu", attn_implementation="flash_attention_2")
144
+ self.embed = model.model.embed_tokens
145
+ for p in self.embed.parameters():
146
+ p.requires_grad = False
147
+ # set text embedding to non-trainable
148
+
149
+ # self.post_embed = model.model.rotary_emb
150
+ self._output_size = model.config.hidden_size
151
+
152
+ def output_size(self) -> int:
153
+ return self._output_size
154
+
155
+ def forward(
156
+ self,
157
+ input_ids: torch.Tensor,
158
+ ilens: torch.Tensor,
159
+ ):
160
+ input_masks = (~make_pad_mask(ilens)).to(input_ids.device).long()
161
+
162
+ outs = self.embed(input_ids)
163
+
164
+ return outs, input_masks
165
+
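
A hedged sketch of calling `QwenEncoder`; it downloads the checkpoint referenced by `pretrain_path` (kept at the diff's default here), so treat this as illustrative rather than a test.

import torch

encoder = QwenEncoder(input_size=512)         # input_size is stored but not used in forward
input_ids = torch.randint(0, 1000, (2, 12))   # (batch, seq) token ids
ilens = torch.tensor([12, 8])                 # true lengths; the rest is padding
hidden, lens = encoder(input_ids, ilens)      # hidden: (batch, seq, hidden_size)
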
inspiremusic/transformer/subsampling.py ADDED
@@ -0,0 +1,384 @@
1
+ # Copyright (c) 2021 Mobvoi Inc (Binbin Zhang, Di Wu)
2
+ # 2024 Alibaba Inc (Xiang Lyu)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Modified from ESPnet(https://github.com/espnet/espnet)
16
+ """Subsampling layer definition."""
17
+
18
+ from typing import Tuple, Union
19
+
20
+ import torch
21
+
22
+
23
+ class BaseSubsampling(torch.nn.Module):
24
+
25
+ def __init__(self):
26
+ super().__init__()
27
+ self.right_context = 0
28
+ self.subsampling_rate = 1
29
+
30
+ def position_encoding(self, offset: Union[int, torch.Tensor],
31
+ size: int) -> torch.Tensor:
32
+ return self.pos_enc.position_encoding(offset, size)
33
+
34
+
35
+ class EmbedinigNoSubsampling(BaseSubsampling):
36
+ """Embedding input without subsampling
37
+ """
38
+
39
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
40
+ pos_enc_class: torch.nn.Module):
41
+ super().__init__()
42
+ self.embed = torch.nn.Embedding(idim, odim)
43
+ self.pos_enc = pos_enc_class
44
+
45
+ def forward(
46
+ self,
47
+ x: torch.Tensor,
48
+ x_mask: torch.Tensor,
49
+ offset: Union[int, torch.Tensor] = 0
50
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
51
+ """Input x.
52
+
53
+ Args:
54
+ x (torch.Tensor): Input tensor (#batch, time, idim).
55
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
56
+
57
+ Returns:
58
+ torch.Tensor: linear input tensor (#batch, time', odim),
59
+ where time' = time .
60
+ torch.Tensor: linear input mask (#batch, 1, time'),
61
+ where time' = time .
62
+
63
+ """
64
+ x = self.embed(x)
65
+ x, pos_emb = self.pos_enc(x, offset)
66
+ return x, pos_emb, x_mask
67
+
68
+
69
+ class LinearNoSubsampling(BaseSubsampling):
70
+ """Linear transform the input without subsampling
71
+
72
+ Args:
73
+ idim (int): Input dimension.
74
+ odim (int): Output dimension.
75
+ dropout_rate (float): Dropout rate.
76
+
77
+ """
78
+
79
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
80
+ pos_enc_class: torch.nn.Module):
81
+ """Construct an linear object."""
82
+ super().__init__()
83
+ self.out = torch.nn.Sequential(
84
+ torch.nn.Linear(idim, odim),
85
+ torch.nn.LayerNorm(odim, eps=1e-5),
86
+ torch.nn.Dropout(dropout_rate),
87
+ )
88
+ self.pos_enc = pos_enc_class
89
+ self.right_context = 0
90
+ self.subsampling_rate = 1
91
+
92
+ def forward(
93
+ self,
94
+ x: torch.Tensor,
95
+ x_mask: torch.Tensor,
96
+ offset: Union[int, torch.Tensor] = 0
97
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
98
+ """Input x.
99
+
100
+ Args:
101
+ x (torch.Tensor): Input tensor (#batch, time, idim).
102
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
103
+
104
+ Returns:
105
+ torch.Tensor: linear input tensor (#batch, time', odim),
106
+ where time' = time .
107
+ torch.Tensor: linear input mask (#batch, 1, time'),
108
+ where time' = time .
109
+
110
+ """
111
+ x = self.out(x)
112
+ x, pos_emb = self.pos_enc(x, offset)
113
+ return x, pos_emb, x_mask
114
+
115
+
116
+ class Conv1dSubsampling2(BaseSubsampling):
117
+ """Convolutional 1D subsampling (to 1/2 length).
118
+ It is designed for Whisper, ref:
119
+ https://github.com/openai/whisper/blob/main/whisper/model.py
120
+
121
+ Args:
122
+ idim (int): Input dimension.
123
+ odim (int): Output dimension.
124
+ dropout_rate (float): Dropout rate.
125
+
126
+ """
127
+
128
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
129
+ pos_enc_class: torch.nn.Module):
130
+ """Construct an Conv1dSubsampling2 object."""
131
+ super().__init__()
132
+ self.conv = torch.nn.Sequential(
133
+ torch.nn.Conv1d(idim, odim, kernel_size=3, padding=1),
134
+ torch.nn.GELU(),
135
+ torch.nn.Conv1d(odim, odim, kernel_size=3, stride=2, padding=1),
136
+ torch.nn.GELU(),
137
+ )
138
+ self.pos_enc = pos_enc_class
139
+ # The right context for every conv layer is computed by:
140
+ # (kernel_size - 1) * frame_rate_of_this_layer
141
+ self.subsampling_rate = 2
142
+ # 4 = (3 - 1) * 1 + (3 - 1) * 1
143
+ self.right_context = 4
144
+
145
+ def forward(
146
+ self,
147
+ x: torch.Tensor,
148
+ x_mask: torch.Tensor,
149
+ offset: Union[int, torch.Tensor] = 0
150
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
151
+ """Subsample x.
152
+
153
+ Args:
154
+ x (torch.Tensor): Input tensor (#batch, time, idim).
155
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
156
+
157
+ Returns:
158
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
159
+ where time' = time // 2.
160
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
161
+ where time' = time // 2.
162
+ torch.Tensor: positional encoding
163
+
164
+ """
165
+ time = x.size(1)
166
+ x = x.transpose(1, 2) # (b, f, t)
167
+ x = self.conv(x)
168
+ x = x.transpose(1, 2) # (b, t, f)
169
+ x, pos_emb = self.pos_enc(x, offset)
170
+ return x, pos_emb, x_mask[:, :, (time + 1) % 2::2]
171
+
172
+
173
+ class Conv2dSubsampling4(BaseSubsampling):
174
+ """Convolutional 2D subsampling (to 1/4 length).
175
+
176
+ Args:
177
+ idim (int): Input dimension.
178
+ odim (int): Output dimension.
179
+ dropout_rate (float): Dropout rate.
180
+
181
+ """
182
+
183
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
184
+ pos_enc_class: torch.nn.Module):
185
+ """Construct an Conv2dSubsampling4 object."""
186
+ super().__init__()
187
+ self.conv = torch.nn.Sequential(
188
+ torch.nn.Conv2d(1, odim, 3, 2),
189
+ torch.nn.ReLU(),
190
+ torch.nn.Conv2d(odim, odim, 3, 2),
191
+ torch.nn.ReLU(),
192
+ )
193
+ self.out = torch.nn.Sequential(
194
+ torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
195
+ self.pos_enc = pos_enc_class
196
+ # The right context for every conv layer is computed by:
197
+ # (kernel_size - 1) * frame_rate_of_this_layer
198
+ self.subsampling_rate = 4
199
+ # 6 = (3 - 1) * 1 + (3 - 1) * 2
200
+ self.right_context = 6
201
+
202
+ def forward(
203
+ self,
204
+ x: torch.Tensor,
205
+ x_mask: torch.Tensor,
206
+ offset: Union[int, torch.Tensor] = 0
207
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
208
+ """Subsample x.
209
+
210
+ Args:
211
+ x (torch.Tensor): Input tensor (#batch, time, idim).
212
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
213
+
214
+ Returns:
215
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
216
+ where time' = time // 4.
217
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
218
+ where time' = time // 4.
219
+ torch.Tensor: positional encoding
220
+
221
+ """
222
+ x = x.unsqueeze(1) # (b, c=1, t, f)
223
+ x = self.conv(x)
224
+ b, c, t, f = x.size()
225
+ x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
226
+ x, pos_emb = self.pos_enc(x, offset)
227
+ return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2]
228
+
229
+
230
+ class Conv2dSubsampling6(BaseSubsampling):
231
+ """Convolutional 2D subsampling (to 1/6 length).
232
+ Args:
233
+ idim (int): Input dimension.
234
+ odim (int): Output dimension.
235
+ dropout_rate (float): Dropout rate.
236
+ pos_enc (torch.nn.Module): Custom position encoding layer.
237
+ """
238
+
239
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
240
+ pos_enc_class: torch.nn.Module):
241
+ """Construct an Conv2dSubsampling6 object."""
242
+ super().__init__()
243
+ self.conv = torch.nn.Sequential(
244
+ torch.nn.Conv2d(1, odim, 3, 2),
245
+ torch.nn.ReLU(),
246
+ torch.nn.Conv2d(odim, odim, 5, 3),
247
+ torch.nn.ReLU(),
248
+ )
249
+ self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3),
250
+ odim)
251
+ self.pos_enc = pos_enc_class
252
+ # 10 = (3 - 1) * 1 + (5 - 1) * 2
253
+ self.subsampling_rate = 6
254
+ self.right_context = 10
255
+
256
+ def forward(
257
+ self,
258
+ x: torch.Tensor,
259
+ x_mask: torch.Tensor,
260
+ offset: Union[int, torch.Tensor] = 0
261
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
262
+ """Subsample x.
263
+ Args:
264
+ x (torch.Tensor): Input tensor (#batch, time, idim).
265
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
266
+
267
+ Returns:
268
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
269
+ where time' = time // 6.
270
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
271
+ where time' = time // 6.
272
+ torch.Tensor: positional encoding
273
+ """
274
+ x = x.unsqueeze(1) # (b, c, t, f)
275
+ x = self.conv(x)
276
+ b, c, t, f = x.size()
277
+ x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
278
+ x, pos_emb = self.pos_enc(x, offset)
279
+ return x, pos_emb, x_mask[:, :, 2::2][:, :, 4::3]
280
+
281
+
282
+ class Conv2dSubsampling8(BaseSubsampling):
283
+ """Convolutional 2D subsampling (to 1/8 length).
284
+
285
+ Args:
286
+ idim (int): Input dimension.
287
+ odim (int): Output dimension.
288
+ dropout_rate (float): Dropout rate.
289
+
290
+ """
291
+
292
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
293
+ pos_enc_class: torch.nn.Module):
294
+ """Construct an Conv2dSubsampling8 object."""
295
+ super().__init__()
296
+ self.conv = torch.nn.Sequential(
297
+ torch.nn.Conv2d(1, odim, 3, 2),
298
+ torch.nn.ReLU(),
299
+ torch.nn.Conv2d(odim, odim, 3, 2),
300
+ torch.nn.ReLU(),
301
+ torch.nn.Conv2d(odim, odim, 3, 2),
302
+ torch.nn.ReLU(),
303
+ )
304
+ self.linear = torch.nn.Linear(
305
+ odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
306
+ self.pos_enc = pos_enc_class
307
+ self.subsampling_rate = 8
308
+ # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
309
+ self.right_context = 14
310
+
311
+ def forward(
312
+ self,
313
+ x: torch.Tensor,
314
+ x_mask: torch.Tensor,
315
+ offset: Union[int, torch.Tensor] = 0
316
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
317
+ """Subsample x.
318
+
319
+ Args:
320
+ x (torch.Tensor): Input tensor (#batch, time, idim).
321
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
322
+
323
+ Returns:
324
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
325
+ where time' = time // 8.
326
+ torch.Tensor: Subsampled mask (#batch, 1, time'),
327
+ where time' = time // 8.
328
+ torch.Tensor: positional encoding
329
+ """
330
+ x = x.unsqueeze(1) # (b, c, t, f)
331
+ x = self.conv(x)
332
+ b, c, t, f = x.size()
333
+ x = self.linear(x.transpose(1, 2).contiguous().view(b, t, c * f))
334
+ x, pos_emb = self.pos_enc(x, offset)
335
+ return x, pos_emb, x_mask[:, :, 2::2][:, :, 2::2][:, :, 2::2]
336
+
337
+
338
+ class LegacyLinearNoSubsampling(BaseSubsampling):
339
+ """Linear transform the input without subsampling
340
+
341
+ Args:
342
+ idim (int): Input dimension.
343
+ odim (int): Output dimension.
344
+ dropout_rate (float): Dropout rate.
345
+
346
+ """
347
+
348
+ def __init__(self, idim: int, odim: int, dropout_rate: float,
349
+ pos_enc_class: torch.nn.Module):
350
+ """Construct an linear object."""
351
+ super().__init__()
352
+ self.out = torch.nn.Sequential(
353
+ torch.nn.Linear(idim, odim),
354
+ torch.nn.LayerNorm(odim, eps=1e-5),
355
+ torch.nn.Dropout(dropout_rate),
356
+ torch.nn.ReLU(),
357
+ )
358
+ self.pos_enc = pos_enc_class
359
+ self.right_context = 0
360
+ self.subsampling_rate = 1
361
+
362
+ def forward(
363
+ self,
364
+ x: torch.Tensor,
365
+ x_mask: torch.Tensor,
366
+ offset: Union[int, torch.Tensor] = 0
367
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
368
+ """Input x.
369
+
370
+ Args:
371
+ x (torch.Tensor): Input tensor (#batch, time, idim).
372
+ x_mask (torch.Tensor): Input mask (#batch, 1, time).
373
+
374
+ Returns:
375
+ torch.Tensor: linear input tensor (#batch, time', odim),
376
+ where time' = time .
377
+ torch.Tensor: linear input mask (#batch, 1, time'),
378
+ where time' = time .
379
+
380
+ """
381
+
382
+ x = self.out(x)
383
+ x, pos_emb = self.pos_enc(x, offset)
384
+ return x, pos_emb, x_mask
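
A shape-only sketch of `Conv2dSubsampling4`; a trivial stand-in is used for the positional-encoding module (normally supplied from `inspiremusic/transformer/embedding.py`), so only the 4x time reduction is illustrated.

import torch

class _DummyPosEnc(torch.nn.Module):
    """Stand-in pos_enc_class: returns the input unchanged plus a zero pos_emb."""
    def forward(self, x, offset=0):
        return x, torch.zeros(1, x.size(1), x.size(2))

sub = Conv2dSubsampling4(idim=80, odim=256, dropout_rate=0.1,
                         pos_enc_class=_DummyPosEnc())
x = torch.randn(2, 100, 80)                      # (batch, time, feat)
x_mask = torch.ones(2, 1, 100, dtype=torch.bool)
y, pos_emb, y_mask = sub(x, x_mask)              # y: (2, 24, 256), y_mask: (2, 1, 24)
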
inspiremusic/utils/__init__.py ADDED
File without changes
inspiremusic/utils/audio_utils.py ADDED
@@ -0,0 +1,623 @@
1
+ # Copyright (c) 2024 Alibaba Inc
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ #!/usr/bin/env python
16
+ # -*- coding: utf-8 -*-
17
+
18
+ import io
19
+ import logging
20
+ import re
21
+ import sys
22
+ import inspect
23
+ import random
24
+ import typing as tp
25
+ from functools import partial
26
+
27
+ import omegaconf
28
+ import torch
29
+ import torchaudio
30
+ import numpy as np
31
+
32
+ from typing_extensions import Literal
33
+ from typing import (
34
+ Any,
35
+ Union,
36
+ Iterable,
37
+ List,
38
+ Dict,
39
+ Optional,
40
+ Tuple,
41
+ )
42
+
43
+ from librosa.filters import mel as librosa_mel_fn
+ from librosa.util.exceptions import ParameterError  # raised by normalize() below
44
+ from scipy.io.wavfile import read
45
+
46
+ _BoolLike_co = Union[bool, np.bool_]
47
+ _IntLike_co = Union[_BoolLike_co, int, "np.integer[Any]"]
48
+ _FloatLike_co = Union[_IntLike_co, float, "np.floating[Any]"]
49
+
50
+ def process_audio(file_path, target_sample_rate=24000):
51
+ audio, sample_rate = torchaudio.load(file_path)
52
+ # Check if the audio needs to be resampled
53
+ if sample_rate != target_sample_rate:
54
+ audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)(audio)
55
+ # Convert stereo to mono (if necessary)
56
+ audio = audio.mean(dim=0, keepdim=True) if audio.size(0) == 2 else audio
57
+ return audio, target_sample_rate
58
+
59
+ def load_wav(full_path):
60
+ sampling_rate, data = read(full_path)
61
+ return data, sampling_rate
62
+
63
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
64
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
65
+
66
+
67
+ def dynamic_range_decompression(x, C=1):
68
+ return np.exp(x) / C
69
+
70
+
71
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
72
+ return torch.log(torch.clamp(x, min=clip_val) * C)
73
+
74
+
75
+ def dynamic_range_decompression_torch(x, C=1):
76
+ return torch.exp(x) / C
77
+
78
+
79
+ def spectral_normalize_torch(magnitudes):
80
+ output = dynamic_range_compression_torch(magnitudes)
81
+ return output
82
+
83
+
84
+ def spectral_de_normalize_torch(magnitudes):
85
+ output = dynamic_range_decompression_torch(magnitudes)
86
+ return output
87
+
88
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
89
+ if torch.min(y) < -1.0:
90
+ print("min value is ", torch.min(y))
91
+ if torch.max(y) > 1.0:
92
+ print("max value is ", torch.max(y))
93
+
94
+ # global mel_basis, hann_window # pylint: disable=global-statement,global-variable-not-assigned
95
+ mel_basis = {}
96
+ hann_window = {}
97
+ if f"{str(fmax)}_{str(y.device)}" not in mel_basis:
98
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
99
+ mel_basis[str(fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device)
100
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
101
+
102
+ y = torch.nn.functional.pad(
103
+ y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect"
104
+ )
105
+ y = y.squeeze(1)
106
+
107
+ spec = torch.view_as_real(
108
+ torch.stft(
109
+ y,
110
+ n_fft,
111
+ hop_length=hop_size,
112
+ win_length=win_size,
113
+ window=hann_window[str(y.device)],
114
+ center=center,
115
+ pad_mode="reflect",
116
+ normalized=False,
117
+ onesided=True,
118
+ return_complex=True,
119
+ )
120
+ )
121
+
122
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
123
+
124
+ spec = torch.matmul(mel_basis[str(fmax) + "_" + str(y.device)], spec)
125
+ spec = spectral_normalize_torch(spec)
126
+
127
+ return spec
128
+
129
+
130
+ def fade_out(audio: torch.Tensor, sample_rate: int,
131
+ fade_duration: float) -> torch.Tensor:
132
+ """
133
+ Apply a linear fade-out effect to the given audio waveform.
134
+
135
+ Parameters:
136
+ audio (torch.Tensor): The audio waveform tensor.
137
+ sample_rate (int): Sample rate of the audio.
138
+ fade_duration (float): Duration of the fade-out effect in seconds.
139
+
140
+ Returns:
141
+ torch.Tensor: The audio with the fade-out effect applied.
142
+ """
143
+ fade_samples = int(fade_duration * sample_rate)
144
+
145
+ if fade_samples > audio.shape[1]:
146
+ fade_samples = audio.shape[
147
+ 1] # use the whole length of audio if necessary
148
+
149
+ fade_out_envelope = torch.linspace(1.0, 0.0, fade_samples,
150
+ dtype=audio.dtype, device=audio.device)
151
+
152
+ fade_section = audio[:, -fade_samples:].clone()
153
+
154
+ fade_section *= fade_out_envelope
155
+
156
+ faded_audio = audio.clone()
157
+ faded_audio[:, -fade_samples:] = fade_section
158
+
159
+ return faded_audio
160
+
161
+ def split_wav_into_chunks(num_samples, wav, max_chunk_size, minimum_chunk_size=720):
162
+ num_chunks = (num_samples + max_chunk_size - 1) // max_chunk_size # Ceiling division
163
+ wav_chunks = []
164
+ for i in range(num_chunks):
165
+ start_idx = i * max_chunk_size
166
+ end_idx = min(start_idx + max_chunk_size, num_samples)
167
+ if (end_idx - start_idx) >= minimum_chunk_size:
168
+ if len(wav.shape) == 2:
169
+ chunk = wav[:,start_idx:end_idx]
170
+ else:
171
+ chunk = wav[start_idx:end_idx]
172
+ wav_chunks.append(chunk)
173
+ else:
174
+ print(f"{num_samples}:{num_chunks}, chunk size={(end_idx - start_idx)} is lower then minimum_chunk_size!")
175
+ return wav_chunks
176
+
177
+ def tiny(x: Union[float, np.ndarray]) -> _FloatLike_co:
178
+ """Compute the tiny-value corresponding to an input's data type.
179
+ """
180
+ # Make sure we have an array view
181
+ x = np.asarray(x)
182
+
183
+ # Only floating types generate a tiny
184
+ if np.issubdtype(x.dtype, np.floating) or np.issubdtype(
185
+ x.dtype, np.complexfloating
186
+ ):
187
+ dtype = x.dtype
188
+ else:
189
+ dtype = np.dtype(np.float32)
190
+
191
+ return np.finfo(dtype).tiny
192
+
193
+ def detect_silence(audio, sample_rate, threshold=0.05, min_silence_duration=1):
194
+ """
195
+ Detects the first occurrence of silence in the audio.
196
+
197
+ Parameters:
198
+ audio (Tensor): The audio waveform.
199
+ sample_rate (int): The sample rate of the audio.
200
+ threshold (float): The threshold below which the signal is considered silent.
201
+ min_silence_duration (float): The minimum duration of silence in seconds.
202
+
203
+ Returns:
204
+ int: The timestamp (in samples) where the silence starts.
205
+ """
206
+ # Convert the audio to a numpy array for easier manipulation
207
+ audio_np = audio.numpy().flatten()
208
+ # Calculate the energy of the signal
209
+ energy = np.abs(audio_np)
210
+ # Find the indices where the energy is below the threshold
211
+ silent_indices = np.where(energy < threshold)[0]
212
+ # Find the start and end of contiguous silent regions
213
+ silent_regions = np.split(silent_indices, np.where(np.diff(silent_indices) != 1)[0] + 1)
214
+ # Filter out regions that are too short
215
+ min_silence_samples = int(min_silence_duration * sample_rate)
216
+ for region in silent_regions:
217
+ if len(region) >= min_silence_samples:
218
+ return region[0]
219
+
220
+ # If no silence is found, return the length of the audio
221
+ return len(audio_np)
222
+
223
+ def trim_audio(waveform, sample_rate=24000, threshold=0.05, min_silence_duration=1, minimum_silence_start_sample=24000):
224
+ """
225
+ Trims the audio from the beginning to the first occurrence of silence.
226
+
227
+ Parameters:
228
+ waveform (Tensor): The waveform data of the input audio file.
229
+ sample_rate (int): Sample rate of the input audio file.
230
+ threshold (float): The threshold below which the signal is considered silent.
231
+ min_silence_duration (float): The minimum duration of silence in seconds.
232
+ """
233
+ # Detect the first occurrence of silence
234
+ silence_start_sample = detect_silence(waveform, sample_rate, threshold, min_silence_duration)
235
+ if silence_start_sample > minimum_silence_start_sample:
236
+ trimmed_waveform = waveform[:silence_start_sample]
237
+ else:
238
+ trimmed_waveform = waveform[:minimum_silence_start_sample]
239
+ if isinstance(trimmed_waveform, torch.Tensor):
240
+ return trimmed_waveform
241
+ else:
242
+ return torch.as_tensor(trimmed_waveform).unsqueeze(0)  # ensure a (1, T) tensor is returned
243
+
244
+ def normalize_loudness(wav: torch.Tensor, sample_rate: int, loudness_headroom_db: float = 14,
245
+ loudness_compressor: bool = False, energy_floor: float = 2e-3):
246
+ """Normalize an input signal to a user loudness in dB LKFS.
247
+ Audio loudness is defined according to the ITU-R BS.1770-4 recommendation.
248
+
249
+ Args:
250
+ wav (torch.Tensor): Input multichannel audio data.
251
+ sample_rate (int): Sample rate.
252
+ loudness_headroom_db (float): Target loudness of the output in dB LUFS.
253
+ loudness_compressor (bool): Uses tanh for soft clipping.
254
+ energy_floor (float): anything below that RMS level will not be rescaled.
255
+ Returns:
256
+ torch.Tensor: Loudness normalized output data.
257
+ """
258
+ energy = wav.pow(2).mean().sqrt().item()
259
+ if energy < energy_floor:
260
+ return wav
261
+ transform = torchaudio.transforms.Loudness(sample_rate)
262
+ input_loudness_db = transform(wav).item()
263
+ # calculate the gain needed to scale to the desired loudness level
264
+ delta_loudness = -loudness_headroom_db - input_loudness_db
265
+ gain = 10.0 ** (delta_loudness / 20.0)
266
+ output = gain * wav
267
+ if loudness_compressor:
268
+ output = torch.tanh(output)
269
+ assert output.isfinite().all(), (input_loudness_db, wav.pow(2).mean().sqrt())
270
+ return output
271
+
272
+ def normalize(
273
+ S: np.ndarray,
274
+ *,
275
+ norm: Optional[float] = np.inf,
276
+ axis: Optional[int] = 0,
277
+ threshold: Optional[_FloatLike_co] = None,
278
+ fill: Optional[bool] = None,
279
+ ) -> np.ndarray:
280
+ """Normalize an array along a chosen axis.
281
+ """
282
+ # Avoid div-by-zero
283
+ if threshold is None:
284
+ threshold = tiny(S)
285
+
286
+ elif threshold <= 0:
287
+ raise ParameterError(f"threshold={threshold} must be strictly positive")
288
+
289
+ if fill not in [None, False, True]:
290
+ raise ParameterError(f"fill={fill} must be None or boolean")
291
+
292
+ if not np.isfinite(S).all():
293
+ raise ParameterError("Input must be finite")
294
+
295
+ # All norms only depend on magnitude, let's do that first
296
+ S = S.numpy() if isinstance(S, torch.Tensor) else np.asarray(S)  # accept tensors or arrays
297
+ mag = np.abs(S).astype(float)
298
+
299
+ # For max/min norms, filling with 1 works
300
+ fill_norm = 1
301
+
302
+ if norm is None:
303
+ return S
304
+
305
+ elif norm == np.inf:
306
+ length = np.max(mag, axis=axis, keepdims=True)
307
+
308
+ elif norm == -np.inf:
309
+ length = np.min(mag, axis=axis, keepdims=True)
310
+
311
+ elif norm == 0:
312
+ if fill is True:
313
+ raise ParameterError("Cannot normalize with norm=0 and fill=True")
314
+
315
+ length = np.sum(mag > 0, axis=axis, keepdims=True, dtype=mag.dtype)
316
+
317
+ elif np.issubdtype(type(norm), np.number) and norm > 0:
318
+ length = np.sum(mag**norm, axis=axis, keepdims=True) ** (1.0 / norm)
319
+
320
+ if axis is None:
321
+ fill_norm = mag.size ** (-1.0 / norm)
322
+ else:
323
+ fill_norm = mag.shape[axis] ** (-1.0 / norm)
324
+
325
+ else:
326
+ raise ParameterError(f"Unsupported norm: {repr(norm)}")
327
+
328
+ # indices where norm is below the threshold
329
+ small_idx = length < threshold
330
+
331
+ Snorm = np.empty_like(S)
332
+ if fill is None:
333
+ # Leave small indices un-normalized
334
+ length[small_idx] = 1.0
335
+ Snorm[:] = S / length
336
+
337
+ elif fill:
338
+ # If we have a non-zero fill value, we locate those entries by
339
+ # doing a nan-divide.
340
+ # If S was finite, then length is finite (except for small positions)
341
+ length[small_idx] = np.nan
342
+ Snorm[:] = S / length
343
+ Snorm[np.isnan(Snorm)] = fill_norm
344
+ else:
345
+ # Set small values to zero by doing an inf-divide.
346
+ # This is safe (by IEEE-754) as long as S is finite.
347
+ length[small_idx] = np.inf
348
+ Snorm[:] = S / length
349
+
350
+ return Snorm
351
+
352
+ def normalize_audio(wav: torch.Tensor, normalize: bool = True,
353
+ strategy: str = 'peak', peak_clip_headroom_db: float = 1,
354
+ rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
355
+ loudness_compressor: bool = False, log_clipping: bool = False,
356
+ sample_rate: tp.Optional[int] = None,
357
+ stem_name: tp.Optional[str] = None) -> torch.Tensor:
358
+ """Normalize the audio according to the prescribed strategy (see after).
359
+
360
+ Args:
361
+ wav (torch.Tensor): Audio data.
362
+ normalize (bool): if `True` (default), normalizes according to the prescribed
363
+ strategy (see after). If `False`, the strategy is only used in case clipping
364
+ would happen.
365
+ strategy (str): Can be either 'clip', 'peak', or 'rms'. Default is 'peak',
366
+ i.e. audio is normalized by its largest value. RMS normalizes by root-mean-square
367
+ with extra headroom to avoid clipping. 'clip' just clips.
368
+ peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
369
+ rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much larger
370
+ than the `peak_clip` one to avoid further clipping.
371
+ loudness_headroom_db (float): Target loudness for loudness normalization.
372
+ loudness_compressor (bool): If True, uses tanh based soft clipping.
373
+ log_clipping (bool): If True, basic logging on stderr when clipping still
374
+ occurs despite strategy (only for 'rms').
375
+ sample_rate (int): Sample rate for the audio data (required for loudness).
376
+ stem_name (str, optional): Stem name for clipping logging.
377
+ Returns:
378
+ torch.Tensor: Normalized audio.
379
+ """
380
+ scale_peak = 10 ** (-peak_clip_headroom_db / 20)
381
+ scale_rms = 10 ** (-rms_headroom_db / 20)
382
+ if strategy == 'peak':
383
+ rescaling = (scale_peak / wav.abs().max())
384
+ if normalize or rescaling < 1:
385
+ wav = wav * rescaling
386
+ elif strategy == 'clip':
387
+ wav = wav.clamp(-scale_peak, scale_peak)
388
+ elif strategy == 'rms':
389
+ mono = wav.mean(dim=0)
390
+ rescaling = scale_rms / mono.pow(2).mean().sqrt()
391
+ if normalize or rescaling < 1:
392
+ wav = wav * rescaling
393
+ _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
394
+ elif strategy == 'loudness':
395
+ assert sample_rate is not None, "Loudness normalization requires sample rate."
396
+ wav = normalize_loudness(wav, sample_rate, loudness_headroom_db, loudness_compressor)
397
+ _clip_wav(wav, log_clipping=log_clipping, stem_name=stem_name)
398
+ else:
399
+ assert wav.abs().max() < 1
400
+ assert strategy == '' or strategy == 'none', f"Unexpected strategy: '{strategy}'"
401
+ return wav
402
+
403
+
404
+ def f32_pcm(wav: torch.Tensor) -> torch.Tensor:
405
+ """
406
+ Convert audio to float 32 bits PCM format.
407
+ Args:
408
+ wav (torch.tensor): Input wav tensor
409
+ Returns:
410
+ same wav in float32 PCM format
411
+ """
412
+ if wav.dtype.is_floating_point:
413
+ return wav
414
+ elif wav.dtype == torch.int16:
415
+ return wav.float() / 2**15
416
+ elif wav.dtype == torch.int32:
417
+ return wav.float() / 2**31
418
+ raise ValueError(f"Unsupported wav dtype: {wav.dtype}")
419
+
420
+
421
+ def i16_pcm(wav: torch.Tensor) -> torch.Tensor:
422
+ """Convert audio to int 16 bits PCM format.
423
+
424
+ ..Warning:: There exist many formulas for doing this conversion. None are perfect
425
+ due to the asymmetry of the int16 range. One either has possible clipping, DC offset,
426
+ or inconsistencies with f32_pcm. If the given wav doesn't have enough headroom,
427
+ it is possible that `i16_pcm(f32_pcm(wav)) != wav`.
428
+ Args:
429
+ wav (torch.tensor): Input wav tensor
430
+ Returns:
431
+ same wav in int16 PCM format
432
+ """
433
+ if wav.dtype.is_floating_point:
434
+ assert wav.abs().max() <= 1
435
+ candidate = (wav * 2 ** 15).round()
436
+ if candidate.max() >= 2 ** 15: # clipping would occur
437
+ candidate = (wav * (2 ** 15 - 1)).round()
438
+ return candidate.short()
439
+ else:
440
+ assert wav.dtype == torch.int16
441
+ return wav
442
+
443
+
444
+ def compress(wav: torch.Tensor, sr: int,
445
+ target_format: tp.Literal["mp3", "ogg", "flac"] = "mp3",
446
+ bitrate: str = "128k") -> tp.Tuple[torch.Tensor, int]:
447
+ """Convert audio wave form to a specified lossy format: mp3, ogg, flac
448
+
449
+ Args:
450
+ wav (torch.Tensor): Input wav tensor.
451
+ sr (int): Sampling rate.
452
+ target_format (str): Compression format (e.g., 'mp3').
453
+ bitrate (str): Bitrate for compression.
454
+
455
+ Returns:
456
+ Tuple of compressed WAV tensor and sampling rate.
457
+ """
458
+
459
+ # Extract the bit rate from string (e.g., '128k')
460
+ match = re.search(r"\d+(\.\d+)?", str(bitrate))
461
+ parsed_bitrate = float(match.group()) if match else None
462
+ assert parsed_bitrate, f"Invalid bitrate specified (got {parsed_bitrate})"
463
+ try:
464
+ # Create a virtual file instead of saving to disk
465
+ buffer = io.BytesIO()
466
+
467
+ torchaudio.save(
468
+ buffer, wav, sr, format=target_format, bits_per_sample=parsed_bitrate,
469
+ )
470
+ # Move to the beginning of the file
471
+ buffer.seek(0)
472
+ compressed_wav, sr = torchaudio.load(buffer)
473
+ return compressed_wav, sr
474
+
475
+ except RuntimeError:
476
+ logging.warning(
477
+ f"compression failed skipping compression: {format} {parsed_bitrate}"
478
+ )
479
+ return wav, sr
480
+
481
+
482
+ def get_mp3(wav_tensor: torch.Tensor, sr: int, bitrate: str = "128k") -> torch.Tensor:
483
+ """Convert a batch of audio files to MP3 format, maintaining the original shape.
484
+
485
+ This function takes a batch of audio files represented as a PyTorch tensor, converts
486
+ them to MP3 format using the specified bitrate, and returns the batch in the same
487
+ shape as the input.
488
+
489
+ Args:
490
+ wav_tensor (torch.Tensor): Batch of audio files represented as a tensor.
491
+ Shape should be (batch_size, channels, length).
492
+ sr (int): Sampling rate of the audio.
493
+ bitrate (str): Bitrate for MP3 conversion, default is '128k'.
494
+
495
+ Returns:
496
+ torch.Tensor: Batch of audio files converted to MP3 format, with the same
497
+ shape as the input tensor.
498
+ """
499
+ device = wav_tensor.device
500
+ batch_size, channels, original_length = wav_tensor.shape
501
+
502
+ # Flatten tensor for conversion and move to CPU
503
+ wav_tensor_flat = wav_tensor.view(1, -1).cpu()
504
+
505
+ # Convert to MP3 format with specified bitrate
506
+ wav_tensor_flat, _ = compress(wav_tensor_flat, sr, bitrate=bitrate)
507
+
508
+ # Reshape back to original batch format and trim or pad if necessary
509
+ wav_tensor = wav_tensor_flat.view(batch_size, channels, -1)
510
+ compressed_length = wav_tensor.shape[-1]
511
+ if compressed_length > original_length:
512
+ wav_tensor = wav_tensor[:, :, :original_length] # Trim excess frames
513
+ elif compressed_length < original_length:
514
+ padding = torch.zeros(
515
+ batch_size, channels, original_length - compressed_length, device=device
516
+ )
517
+ wav_tensor = torch.cat((wav_tensor, padding), dim=-1) # Pad with zeros
518
+
519
+ # Move tensor back to the original device
520
+ return wav_tensor.to(device)
521
+
522
+
523
+ def get_aac(
524
+ wav_tensor: torch.Tensor,
525
+ sr: int,
526
+ bitrate: str = "128k",
527
+ lowpass_freq: tp.Optional[int] = None,
528
+ ) -> torch.Tensor:
529
+ """Converts a batch of audio tensors to AAC format and then back to tensors.
530
+
531
+ This function first saves the input tensor batch as WAV files, then uses FFmpeg to convert
532
+ these WAV files to AAC format. Finally, it loads the AAC files back into tensors.
533
+
534
+ Args:
535
+ wav_tensor (torch.Tensor): A batch of audio files represented as a tensor.
536
+ Shape should be (batch_size, channels, length).
537
+ sr (int): Sampling rate of the audio.
538
+ bitrate (str): Bitrate for AAC conversion, default is '128k'.
539
+ lowpass_freq (Optional[int]): Frequency for a low-pass filter. If None, no filter is applied.
540
+
541
+ Returns:
542
+ torch.Tensor: Batch of audio files converted to AAC and back, with the same
543
+ shape as the input tensor.
544
+ """
545
+ import tempfile
546
+ import subprocess
547
+
548
+ device = wav_tensor.device
549
+ batch_size, channels, original_length = wav_tensor.shape
550
+
551
+ # Parse the bitrate value from the string
552
+ match = re.search(r"\d+(\.\d+)?", bitrate)
553
+ parsed_bitrate = (
554
+ match.group() if match else "128"
555
+ ) # Default to 128 if parsing fails
556
+
557
+ # Flatten tensor for conversion and move to CPU
558
+ wav_tensor_flat = wav_tensor.view(1, -1).cpu()
559
+
560
+ with tempfile.NamedTemporaryFile(
561
+ suffix=".wav"
562
+ ) as f_in, tempfile.NamedTemporaryFile(suffix=".aac") as f_out:
563
+ input_path, output_path = f_in.name, f_out.name
564
+
565
+ # Save the tensor as a WAV file
566
+ torchaudio.save(input_path, wav_tensor_flat, sr, backend="ffmpeg")
567
+
568
+ # Prepare FFmpeg command for AAC conversion
569
+ command = [
570
+ "ffmpeg",
571
+ "-y",
572
+ "-i",
573
+ input_path,
574
+ "-ar",
575
+ str(sr),
576
+ "-b:a",
577
+ f"{parsed_bitrate}k",
578
+ "-c:a",
579
+ "aac",
580
+ ]
581
+ if lowpass_freq is not None:
582
+ command += ["-cutoff", str(lowpass_freq)]
583
+ command.append(output_path)
584
+
585
+ try:
586
+ # Run FFmpeg and suppress output
587
+ subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
588
+
589
+ # Load the AAC audio back into a tensor
590
+ aac_tensor, _ = torchaudio.load(output_path, backend="ffmpeg")
591
+ except Exception as exc:
592
+ raise RuntimeError(
593
+ "Failed to run command " ".join(command)} "
594
+ "(Often this means ffmpeg is not installed or the encoder is not supported, "
595
+ "make sure you installed an older version ffmpeg<5)"
596
+ ) from exc
597
+
598
+ original_length_flat = batch_size * channels * original_length
599
+ compressed_length_flat = aac_tensor.shape[-1]
600
+
601
+ # Trim excess frames
602
+ if compressed_length_flat > original_length_flat:
603
+ aac_tensor = aac_tensor[:, :original_length_flat]
604
+
605
+ # Pad the shortened frames
606
+ elif compressed_length_flat < original_length_flat:
607
+ padding = torch.zeros(
608
+ 1, original_length_flat - compressed_length_flat, device=device
609
+ )
610
+ aac_tensor = torch.cat((aac_tensor, padding), dim=-1)
611
+
612
+ # Reshape and adjust length to match original tensor
613
+ wav_tensor = aac_tensor.view(batch_size, channels, -1)
614
+ compressed_length = wav_tensor.shape[-1]
615
+
616
+ assert compressed_length == original_length, (
617
+ "AAC-compressed audio does not have the same frames as original one. "
618
+ "One reason can be ffmpeg is not installed and used as proper backed "
619
+ "for torchaudio, or the AAC encoder is not correct. Run "
620
+ "`torchaudio.utils.ffmpeg_utils.get_audio_encoders()` and make sure we see entry for"
621
+ "AAC in the output."
622
+ )
623
+ return wav_tensor.to(device)
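
A small, hedged example chaining a few of the helpers above on a synthetic tone; 'peak' is chosen because the 'rms' and 'loudness' branches call a `_clip_wav` helper that is not part of this excerpt.

import torch

sr = 24000
t = torch.arange(sr) / sr
wav = 0.5 * torch.sin(2 * torch.pi * 440.0 * t).unsqueeze(0)          # (1, T) mono tone

wav = normalize_audio(wav, strategy='peak', peak_clip_headroom_db=1)  # peak at ~-1 dBFS
wav = fade_out(wav, sample_rate=sr, fade_duration=0.25)               # linear 250 ms fade-out
chunks = split_wav_into_chunks(wav.shape[-1], wav, max_chunk_size=sr) # 1-second chunks
pcm16 = i16_pcm(wav)                                                  # int16 PCM, e.g. for saving
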
inspiremusic/utils/binary.py ADDED
@@ -0,0 +1,155 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ """Raw binary format for Encodec compressed audio. Actual compression API is in `encodec.compress`."""
7
+ import io
8
+ import json
9
+ import struct
10
+ import typing as tp
11
+
12
+ # Format: the `ECDC` magic code, followed by a uint8 protocol version (0),
13
+ # then the size of the JSON header as a uint32.
14
+ # The header is then provided as json and should contain all required
15
+ # information for decoding. A raw stream of bytes is then provided
16
+ # and should be interpretable using the json header.
17
+ _encodec_header_struct = struct.Struct('!4sBI')
18
+ _ENCODEC_MAGIC = b'ECDC'
19
+
20
+
21
+ def write_ecdc_header(fo: tp.IO[bytes], metadata: tp.Any):
22
+ meta_dumped = json.dumps(metadata).encode('utf-8')
23
+ version = 0
24
+ header = _encodec_header_struct.pack(_ENCODEC_MAGIC, version,
25
+ len(meta_dumped))
26
+ fo.write(header)
27
+ fo.write(meta_dumped)
28
+ fo.flush()
29
+
30
+
31
+ def _read_exactly(fo: tp.IO[bytes], size: int) -> bytes:
32
+ buf = b""
33
+ while len(buf) < size:
34
+ new_buf = fo.read(size)
35
+ if not new_buf:
36
+ raise EOFError("Impossible to read enough data from the stream, "
37
+ f"{size} bytes remaining.")
38
+ buf += new_buf
39
+ size -= len(new_buf)
40
+ return buf
41
+
42
+
43
+ def read_ecdc_header(fo: tp.IO[bytes]):
44
+ header_bytes = _read_exactly(fo, _encodec_header_struct.size)
45
+ magic, version, meta_size = _encodec_header_struct.unpack(header_bytes)
46
+ if magic != _ENCODEC_MAGIC:
47
+ raise ValueError("File is not in ECDC format.")
48
+ if version != 0:
49
+ raise ValueError("Version not supported.")
50
+ meta_bytes = _read_exactly(fo, meta_size)
51
+ return json.loads(meta_bytes.decode('utf-8'))
52
+
53
+
54
+ class BitPacker:
55
+ """Simple bit packer to handle ints with a non standard width, e.g. 10 bits.
56
+ Note that for some bandwidth (1.5, 3), the codebook representation
57
+ will not cover an integer number of bytes.
58
+
59
+ Args:
60
+ bits (int): number of bits per value that will be pushed.
61
+ fo (IO[bytes]): file-object to push the bytes to.
62
+ """
63
+
64
+ def __init__(self, bits: int, fo: tp.IO[bytes]):
65
+ self._current_value = 0
66
+ self._current_bits = 0
67
+ self.bits = bits
68
+ self.fo = fo
69
+
70
+ def push(self, value: int):
71
+ """Push a new value to the stream. This will immediately
72
+ write as many uint8 as possible to the underlying file-object."""
73
+ self._current_value += (value << self._current_bits)
74
+ self._current_bits += self.bits
75
+ while self._current_bits >= 8:
76
+ lower_8bits = self._current_value & 0xff
77
+ self._current_bits -= 8
78
+ self._current_value >>= 8
79
+ self.fo.write(bytes([lower_8bits]))
80
+
81
+ def flush(self):
82
+ """Flushes the remaining partial uint8, call this at the end
83
+ of the stream to encode."""
84
+ if self._current_bits:
85
+ self.fo.write(bytes([self._current_value]))
86
+ self._current_value = 0
87
+ self._current_bits = 0
88
+ self.fo.flush()
89
+
90
+
91
+ class BitUnpacker:
92
+ """BitUnpacker does the opposite of `BitPacker`.
93
+
94
+ Args:
95
+ bits (int): number of bits of the values to decode.
96
+ fo (IO[bytes]): file-object to push the bytes to.
97
+ """
98
+
99
+ def __init__(self, bits: int, fo: tp.IO[bytes]):
100
+ self.bits = bits
101
+ self.fo = fo
102
+ self._mask = (1 << bits) - 1
103
+ self._current_value = 0
104
+ self._current_bits = 0
105
+
106
+ def pull(self) -> tp.Optional[int]:
107
+ """
108
+ Pull a single value from the stream, potentially reading some
109
+ extra bytes from the underlying file-object.
110
+ Returns `None` when reaching the end of the stream.
111
+ """
112
+ while self._current_bits < self.bits:
113
+ buf = self.fo.read(1)
114
+ if not buf:
115
+ return None
116
+ character = buf[0]
117
+ self._current_value += character << self._current_bits
118
+ self._current_bits += 8
119
+
120
+ out = self._current_value & self._mask
121
+ self._current_value >>= self.bits
122
+ self._current_bits -= self.bits
123
+ return out
124
+
125
+
126
+ def test():
127
+ import torch
128
+ torch.manual_seed(1234)
129
+ for rep in range(4):
130
+ length: int = torch.randint(10, 2_000, (1, )).item()
131
+ bits: int = torch.randint(1, 16, (1, )).item()
132
+ tokens: tp.List[int] = torch.randint(2**bits, (length, )).tolist()
133
+ rebuilt: tp.List[int] = []
134
+ buf = io.BytesIO()
135
+ packer = BitPacker(bits, buf)
136
+ for token in tokens:
137
+ packer.push(token)
138
+ packer.flush()
139
+ buf.seek(0)
140
+ unpacker = BitUnpacker(bits, buf)
141
+ while True:
142
+ value = unpacker.pull()
143
+ if value is None:
144
+ break
145
+ rebuilt.append(value)
146
+ assert len(rebuilt) >= len(tokens), (len(rebuilt), len(tokens))
147
+ # The flushing mechanism might lead to "ghost" values at the end of the stream.
148
+ assert len(rebuilt) <= len(tokens) + 8 // bits, (len(rebuilt),
149
+ len(tokens), bits)
150
+ for idx, (a, b) in enumerate(zip(tokens, rebuilt)):
151
+ assert a == b, (idx, a, b)
152
+
153
+
154
+ if __name__ == '__main__':
155
+ test()
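
Beyond the bit-level `test()` above, a short, hedged round-trip of the ECDC header helpers; the metadata fields are illustrative.

import io

buf = io.BytesIO()
write_ecdc_header(buf, {"model": "encodec_24khz", "audio_length": 24000})
buf.seek(0)
meta = read_ecdc_header(buf)       # validates the magic and version, returns the dict
assert meta["audio_length"] == 24000
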