File size: 9,272 Bytes
b5adbf2
 
 
 
 
 
 
 
 
 
 
 
 
 
b93c24a
8c1251f
 
0ede6a0
3460b5c
ce80c6c
0a0750e
693c129
 
bacc813
37b6092
b5adbf2
 
 
693c129
 
 
 
 
 
 
 
0ae2d37
372eac8
b3d320b
693c129
590d020
 
d89f1ee
 
 
 
 
37b6092
84d3675
37b6092
 
 
b93c24a
f107ce2
9af664a
1fc806b
9ef0154
 
 
 
9af664a
 
 
 
b93c24a
 
 
 
 
 
 
 
9af664a
b93c24a
 
1343135
b93c24a
7d94b32
4e5c199
b93c24a
b3d320b
37b6092
b93c24a
1343135
 
 
b93c24a
1343135
 
 
1fc806b
1343135
37b6092
b93c24a
 
84d3675
 
 
 
 
b3d320b
84d3675
 
 
0e76787
b93c24a
 
 
 
 
1343135
 
 
b93c24a
1343135
 
 
37a3fc6
1343135
b93c24a
 
 
 
 
1343135
 
b93c24a
1343135
 
b93c24a
1343135
 
b93c24a
 
 
8ae3c92
698dade
 
6487b24
9af664a
 
698dade
 
 
8ae3c92
a55649b
b93c24a
8ae3c92
9af664a
 
b93c24a
 
 
d896a67
 
 
90d34b5
520c9cc
7ea60b9
fb52c82
d896a67
040dc24
d896a67
 
 
b3d320b
 
 
d896a67
 
 
 
 
6ac4eba
d896a67
 
 
b3d320b
 
 
 
 
 
9f69f20
416b764
b3d320b
 
ed2f242
b3d320b
 
 
 
 
 
 
ed2f242
b3d320b
 
 
 
 
 
73dfce8
d896a67
 
 
b3d320b
 
4e0151a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# Copyright (c) 2024 Alibaba Inc (authors: Chong Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

# One-time environment provisioning for the (Hugging Face Spaces style) demo
# container. NOTE(review): os.system runs these through a shell; the commands
# are hard-coded (no user input), but failures are silently ignored because
# the return codes are never checked.
os.system('nvidia-smi')
os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
# Matcha-TTS is vendored under third_party/ and must be importable by the
# inspiremusic package at runtime.
os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
# Clone every supported checkpoint from Hugging Face, then rewrite the
# "../../" relative paths inside each model's inspiremusic.yaml so the
# configs resolve from the pretrained_models/ directory.
os.system('mkdir pretrained_models && cd pretrained_models && git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')

import sys
import torch
# Sanity log: confirms cuDNN is visible in this container.
print(torch.backends.cudnn.version())

# Make the vendored Matcha-TTS package importable from this process as well
# (PYTHONPATH above only affects child processes).
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import spaces
import gradio as gr
from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
import torchaudio
import datetime
import hashlib
import importlib

# Checkpoint names; each must exist as a directory under ./pretrained_models
# (cloned during the setup commands above).
MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base", "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
# Where trimmed audio prompts are saved (music-continuation task).
AUDIO_PROMPT_DIR = "demo/audio_prompts"
# Where generated audio files are written.
OUTPUT_AUDIO_DIR = "demo/outputs"

# Example text prompts surfaced in the UI via gr.Examples.
DEMO_TEXT_PROMPTS = ["Jazz music with drum beats.",
					 "A captivating classical piano performance, this piece exudes a dynamic and intense atmosphere, showcasing intricate and expressive instrumental artistry.",
					 "A soothing instrumental piece blending elements of light music and pop, featuring a gentle guitar rendition. The overall feel is serene and reflective, likely instrumental with no vocals.",
					 "The instrumental rock piece features dynamic oscillations and wave-like progressions, creating an immersive and energetic atmosphere. The music is purely instrumental, with no vocals, and it blends elements of rock and post-rock for a powerful and evocative experience.",
					 "The classical instrumental piece exudes a haunting and evocative atmosphere, characterized by its intricate guitar work and profound emotional depth.",
					 "Experience a dynamic blend of instrumental electronic music with futuristic house vibes, featuring energetic beats and a captivating rhythm. The tracks are likely instrumental, focusing on the immersive soundscapes rather than vocal performances."]

def generate_filename():
	"""Return a unique 64-character hex string used as an output file stem.

	The previous implementation hashed the current timestamp truncated to
	whole seconds, so two requests arriving within the same second produced
	identical names and their output files overwrote each other. Mixing the
	full-resolution timestamp with random bytes makes collisions
	practically impossible while keeping the same return format.
	"""
	seed = "{}-{}".format(datetime.datetime.now().timestamp(), os.urandom(8).hex())
	return hashlib.sha256(seed.encode()).hexdigest()

def get_args(
		task, text="", audio=None, model_name="InspireMusic-Base",
		chorus="intro",
		output_sample_rate=48000, max_generate_audio_seconds=30.0, time_start = 0.0, time_end=30.0, trim=False):
	"""Assemble the keyword-argument dict consumed by music_generation().

	Note: `time_end` is effectively ignored — the end time is always
	recomputed as time_start + max_generate_audio_seconds.
	"""
	# 24kHz checkpoints can only emit 24kHz audio, regardless of the
	# sample rate selected in the UI.
	if "24kHz" in model_name:
		output_sample_rate = 24000

	# Fast inference mode is enabled exactly when output is 24kHz.
	fast = output_sample_rate == 24000

	# Normalise a missing start time before deriving the end time from it.
	start = 0.0 if time_start is None else time_start

	args = {
		"task"                      : task,
		"text"                      : text,
		"audio_prompt"              : audio,
		"model_name"                : model_name,
		"chorus"                    : chorus,
		"fast"                      : fast,
		"fade_out"                  : True,
		"trim"                      : trim,
		"output_sample_rate"        : output_sample_rate,
		"min_generate_audio_seconds": 10.0,
		"max_generate_audio_seconds": max_generate_audio_seconds,
		"max_audio_prompt_length"   : 5.0,
		"model_dir"                 : os.path.join("pretrained_models", model_name),
		"result_dir"                : OUTPUT_AUDIO_DIR,
		"output_fn"                 : generate_filename(),
		"format"                    : "wav",
		"time_start"                : start,
		"time_end"                  : start + max_generate_audio_seconds,
		"fade_out_duration"         : 1.0,
	}

	print(args)
	return args


def trim_audio(audio_file, cut_seconds=5):
	"""Keep only the first `cut_seconds` of an audio file.

	Loads `audio_file`, slices off everything after cut_seconds * sample_rate
	samples, and saves the result under AUDIO_PROMPT_DIR with a unique name.

	Returns the path of the trimmed wav file.
	"""
	audio, sr = torchaudio.load(audio_file)
	# int() guards against a float cut_seconds (e.g. 2.5), which would
	# otherwise produce a float slice index and raise a TypeError.
	num_samples = int(cut_seconds * sr)
	cutted_audio = audio[:, :num_samples]
	output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
	torchaudio.save(output_path, cutted_audio, sr)
	return output_path

@spaces.GPU()
def music_generation(args):
	"""Run one generation job on the GPU and return the output audio path.

	`args` is the dict produced by get_args(). A fresh InspireMusicUnified
	model is constructed per call (no caching), then a single inference is
	executed with the task/text/audio-prompt settings from `args`.
	"""
	set_env_variables()
	# Model internal sample rate is fixed at 24kHz; only the output rate
	# (resampling) follows the user's selection.
	model = InspireMusicUnified(
			model_name=args["model_name"],
			model_dir=args["model_dir"],
			min_generate_audio_seconds=args["min_generate_audio_seconds"],
			max_generate_audio_seconds=args["max_generate_audio_seconds"],
			sample_rate=24000,
			output_sample_rate=args["output_sample_rate"],
			load_jit=True,
			load_onnx=False,
			fast=args["fast"],
			result_dir=args["result_dir"])

	output_path = model.inference(
			task=args["task"],
			text=args["text"],
			audio_prompt=args["audio_prompt"],
			chorus=args["chorus"],
			time_start=args["time_start"],
			time_end=args["time_end"],
			output_fn=args["output_fn"],
			max_audio_prompt_length=args["max_audio_prompt_length"],
			fade_out_duration=args["fade_out_duration"],
			output_format=args["format"],
			fade_out_mode=args["fade_out"],
			trim=args["trim"])
	return output_path


def demo_inspiremusic_t2m(text, model_name, chorus,
					 output_sample_rate, max_generate_audio_seconds):
	"""Gradio callback for the text-to-music task."""
	job_args = get_args(
			task='text-to-music',
			text=text,
			audio=None,
			model_name=model_name,
			chorus=chorus,
			output_sample_rate=output_sample_rate,
			max_generate_audio_seconds=max_generate_audio_seconds)
	return music_generation(job_args)

def demo_inspiremusic_con(text, audio, model_name, chorus,
					 output_sample_rate, max_generate_audio_seconds):
	"""Gradio callback for the music-continuation task.

	The uploaded audio prompt is first trimmed to its first 5 seconds.
	"""
	prompt_path = trim_audio(audio, cut_seconds=5)
	job_args = get_args(
			task='continuation',
			text=text,
			audio=prompt_path,
			model_name=model_name,
			chorus=chorus,
			output_sample_rate=output_sample_rate,
			max_generate_audio_seconds=max_generate_audio_seconds)
	return music_generation(job_args)

def main():
	"""Build the Gradio UI and launch the demo server (blocking)."""
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
		# Intro / usage notes shown at the top of the page.
		gr.Markdown("""
		# InspireMusic
		- Support music generation tasks with long-form and high audio quality, sampling rates up to 48kHz. 
		- Github: https://github.com/FunAudioLLM/InspireMusic/
		- Available music generation models: [InspireMusic-1.5B-Long](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long), [InspireMusic-1.5B](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B), [InspireMusic-Base](https://huggingface.co/FunAudioLLM/InspireMusic-Base), [InspireMusic-1.5B-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz), [InspireMusic-Base-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz). Both on Huggingface and ModelScope.
		- Currently only support English text prompts.
		- This page is for demo purpose, if you want to generate long-form audio, e.g., 5mins, please try to deploy locally. Thank you for your support.
		""")

		# Generation settings shared by both tasks.
		with gr.Row(equal_height=True):
			model_name = gr.Dropdown(
					MODELS, label="Select Model Name",
					value="InspireMusic-1.5B-Long")
			chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"],
								 label="Chorus Mode", value="intro")
			output_sample_rate = gr.Dropdown([48000, 24000],
											 label="Output Audio Sample Rate (Hz)",
											 value=48000)
			max_generate_audio_seconds = gr.Slider(10, 300,
												   label="Generate Audio Length (s)",
												   value=30)

		# Inputs: text prompt (text-to-music) and audio prompt (continuation).
		with gr.Row(equal_height=True):
			text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)",
									value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")

			audio_input = gr.Audio(
				label="Input Audio Prompt (For Music Continuation Task)",
				type="filepath")
		# Both task buttons write their result into the same output player.
		music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button = True)

		with gr.Row():
			button = gr.Button("Start Text-to-Music Task")
			button.click(demo_inspiremusic_t2m,
						 inputs=[text_input, model_name,
								 chorus,
								 output_sample_rate,
								 max_generate_audio_seconds],
						 outputs=music_output)

			generate_button = gr.Button("Start Music Continuation Task")
			generate_button.click(demo_inspiremusic_con,
								  inputs=[text_input, audio_input, model_name,
										  chorus,
										  output_sample_rate,
										  max_generate_audio_seconds],
								  outputs=music_output)
		# Clickable example prompts that populate the text box.
		t2m_examples = gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
	demo.launch()

if __name__ == '__main__':
	# Ensure the prompt/output folders exist before the UI starts writing.
	for directory in (AUDIO_PROMPT_DIR, OUTPUT_AUDIO_DIR):
		os.makedirs(directory, exist_ok=True)
	main()