Spaces:
Running
on
Zero
Running
on
Zero
chong.zhang
committed on
Commit
·
c72a6e3
1
Parent(s):
3d0f730
update
Browse files
app.py
CHANGED
@@ -13,201 +13,197 @@
|
|
13 |
# limitations under the License.
|
14 |
|
15 |
import os
|
16 |
-
import sys
|
17 |
-
import torch
|
18 |
-
import gradio as gr
|
19 |
-
import torchaudio
|
20 |
-
import datetime, hashlib
|
21 |
-
from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
|
22 |
|
23 |
-
# Prepare environment and model files (unchanged from original)
|
24 |
os.system('nvidia-smi')
|
25 |
os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
|
26 |
os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
|
27 |
-
os.system(
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git && '
|
32 |
-
'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git && '
|
33 |
-
'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && '
|
34 |
-
# Fix paths in YAML files
|
35 |
-
'for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; '
|
36 |
-
'do sed -i -e "s/..\/..\///g" ${i}/inspiremusic.yaml; done && cd ..'
|
37 |
-
)
|
38 |
print(torch.backends.cudnn.version())
|
|
|
39 |
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
40 |
-
sys.path.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
-
|
43 |
-
MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base",
|
44 |
-
"InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
|
45 |
AUDIO_PROMPT_DIR = "demo/audio_prompts"
|
46 |
OUTPUT_AUDIO_DIR = "demo/outputs"
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
# Load the default model into GPU memory
|
56 |
-
current_model_name = "InspireMusic-1.5B-Long" # default selected model in the UI
|
57 |
-
loaded_model = InspireMusicUnified(
|
58 |
-
model_name=current_model_name,
|
59 |
-
model_dir=os.path.join("pretrained_models", current_model_name),
|
60 |
-
min_generate_audio_seconds=10.0,
|
61 |
-
max_generate_audio_seconds=30.0,
|
62 |
-
sample_rate=24000,
|
63 |
-
output_sample_rate=48000, # 48kHz output for default (non-24kHz model)
|
64 |
-
load_jit=True,
|
65 |
-
load_onnx=False,
|
66 |
-
fast=False, # False because 48000 Hz output (not fast mode)
|
67 |
-
result_dir=OUTPUT_AUDIO_DIR
|
68 |
-
)
|
69 |
-
# (The model is now loaded on the GPU and ready for reuse)
|
70 |
|
71 |
def generate_filename():
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
del loaded_model
|
120 |
-
torch.cuda.empty_cache() # free cached memory​:contentReference[oaicite:10]{index=10}
|
121 |
-
# Load the requested model into GPU memory
|
122 |
-
loaded_model = InspireMusicUnified(
|
123 |
-
model_name=requested_model,
|
124 |
-
model_dir=args["model_dir"],
|
125 |
-
min_generate_audio_seconds=args["min_generate_audio_seconds"],
|
126 |
-
max_generate_audio_seconds=args["max_generate_audio_seconds"],
|
127 |
-
sample_rate=24000,
|
128 |
-
output_sample_rate=args["output_sample_rate"],
|
129 |
-
load_jit=True,
|
130 |
-
load_onnx=False,
|
131 |
-
fast=args["fast"],
|
132 |
-
result_dir=args["result_dir"]
|
133 |
-
)
|
134 |
-
current_model_name = requested_model
|
135 |
-
# Perform inference with the loaded model (no gradient computation needed)
|
136 |
-
with torch.no_grad(): # disable grad to save memory​:contentReference[oaicite:11]{index=11}​:contentReference[oaicite:12]{index=12}
|
137 |
-
output_path = loaded_model.inference(
|
138 |
-
task=args["task"],
|
139 |
-
text=args["text"],
|
140 |
-
audio_prompt=args["audio_prompt"],
|
141 |
-
chorus=args["chorus"],
|
142 |
-
time_start=args["time_start"],
|
143 |
-
time_end=args["time_end"],
|
144 |
-
output_fn=args["output_fn"],
|
145 |
-
max_audio_prompt_length=args["max_audio_prompt_length"],
|
146 |
-
fade_out_duration=args["fade_out_duration"],
|
147 |
-
output_format=args["format"],
|
148 |
-
fade_out_mode=args["fade_out"],
|
149 |
-
trim=args["trim"]
|
150 |
-
)
|
151 |
-
return output_path
|
152 |
-
|
153 |
-
# Demo helper functions (using music_generation internally)
|
154 |
-
def demo_inspiremusic_t2m(text, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
|
155 |
-
args = get_args(task="text-to-music", text=text, audio=None,
|
156 |
-
model_name=model_name, chorus=chorus,
|
157 |
-
output_sample_rate=output_sample_rate,
|
158 |
-
max_generate_audio_seconds=max_generate_audio_seconds)
|
159 |
-
return music_generation(args)
|
160 |
-
|
161 |
-
def demo_inspiremusic_con(text, audio, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
|
162 |
-
# Trim the audio prompt to 5 seconds and use it for continuation
|
163 |
-
trimmed_audio = trim_audio(audio, cut_seconds=5)
|
164 |
-
args = get_args(task="continuation", text=text, audio=trimmed_audio,
|
165 |
-
model_name=model_name, chorus=chorus,
|
166 |
-
output_sample_rate=output_sample_rate,
|
167 |
-
max_generate_audio_seconds=max_generate_audio_seconds)
|
168 |
-
return music_generation(args)
|
169 |
|
170 |
def trim_audio(audio_file, cut_seconds=5):
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
|
179 |
def main():
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
# limitations under the License.
|
14 |
|
15 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
|
|
17 |
os.system('nvidia-smi')
|
18 |
os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
|
19 |
os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
|
20 |
+
os.system('mkdir pretrained_models && cd pretrained_models && git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')
|
21 |
+
|
22 |
+
import sys
|
23 |
+
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
print(torch.backends.cudnn.version())
|
25 |
+
|
26 |
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
|
27 |
+
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
|
28 |
+
|
29 |
+
import spaces
|
30 |
+
import gradio as gr
|
31 |
+
from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
|
32 |
+
import torchaudio
|
33 |
+
import datetime
|
34 |
+
import hashlib
|
35 |
+
import importlib
|
36 |
|
37 |
+
MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base", "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
|
|
|
|
|
38 |
AUDIO_PROMPT_DIR = "demo/audio_prompts"
|
39 |
OUTPUT_AUDIO_DIR = "demo/outputs"
|
40 |
|
41 |
+
DEMO_TEXT_PROMPTS = ["Jazz music with drum beats.",
|
42 |
+
"A captivating classical piano performance, this piece exudes a dynamic and intense atmosphere, showcasing intricate and expressive instrumental artistry.",
|
43 |
+
"A soothing instrumental piece blending elements of light music and pop, featuring a gentle guitar rendition. The overall feel is serene and reflective, likely instrumental with no vocals.",
|
44 |
+
"The instrumental rock piece features dynamic oscillations and wave-like progressions, creating an immersive and energetic atmosphere. The music is purely instrumental, with no vocals, and it blends elements of rock and post-rock for a powerful and evocative experience.",
|
45 |
+
"The classical instrumental piece exudes a haunting and evocative atmosphere, characterized by its intricate guitar work and profound emotional depth.",
|
46 |
+
"Experience a dynamic blend of instrumental electronic music with futuristic house vibes, featuring energetic beats and a captivating rhythm. The tracks are likely instrumental, focusing on the immersive soundscapes rather than vocal performances."]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
def generate_filename():
    """Return a unique hex name (SHA-256 digest) for a generated audio file.

    The digest is taken over the current wall-clock timestamp at full
    (microsecond) precision. The previous version truncated the timestamp
    to whole seconds, so two generation requests arriving within the same
    second produced the same filename and overwrote each other's output.
    """
    timestamp = str(datetime.datetime.now().timestamp())
    return hashlib.sha256(timestamp.encode()).hexdigest()
|
52 |
+
|
53 |
+
def get_args(
        task, text="", audio=None, model_name="InspireMusic-Base",
        chorus="intro",
        output_sample_rate=48000, max_generate_audio_seconds=30.0, time_start = 0.0, time_end=30.0, trim=False):
    """Assemble the keyword dictionary consumed by music_generation().

    Normalizes the sample rate for 24kHz-only model variants, derives the
    "fast" inference flag from the output rate, and fills in fixed demo
    defaults (fade-out, prompt length, output directory, file format).
    """
    # The *-24kHz checkpoints can only synthesize 24 kHz audio, regardless
    # of what the UI dropdown requested.
    if "24kHz" in model_name:
        output_sample_rate = 24000

    # Fast mode is tied to 24 kHz output; 48 kHz goes through the slow path.
    fast = output_sample_rate == 24000

    args = {
        "task": task,
        "text": text,
        "audio_prompt": audio,
        "model_name": model_name,
        "chorus": chorus,
        "fast": fast,
        "fade_out": True,
        "trim": trim,
        "output_sample_rate": output_sample_rate,
        "min_generate_audio_seconds": 10.0,
        "max_generate_audio_seconds": max_generate_audio_seconds,
        "max_audio_prompt_length": 5.0,
        "model_dir": os.path.join("pretrained_models", model_name),
        "result_dir": OUTPUT_AUDIO_DIR,
        "output_fn": generate_filename(),
        "format": "wav",
        "time_start": time_start,
        "time_end": time_end,
        "fade_out_duration": 1.0,
    }

    # Default a missing start offset to 0 and derive the end of the
    # generation window from the requested audio length.
    # NOTE(review): reconstructed from a mangled diff — assumes the
    # time_end assignment is unconditional; confirm against upstream.
    if args["time_start"] is None:
        args["time_start"] = 0.0
    args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"]

    print(args)
    return args
|
95 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
def trim_audio(audio_file, cut_seconds=5):
    """Copy the first cut_seconds of audio_file into AUDIO_PROMPT_DIR.

    Returns the path of the trimmed .wav file; the original file is left
    untouched. Used to cap the continuation prompt length.
    """
    waveform, sample_rate = torchaudio.load(audio_file)
    # Keep only the leading cut_seconds worth of samples (all channels).
    clipped = waveform[:, : cut_seconds * sample_rate]
    output_path = os.path.join(
        AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
    torchaudio.save(output_path, clipped, sample_rate)
    return output_path
|
104 |
+
|
105 |
+
@spaces.GPU(duration=120)
def music_generation(args):
    """Run one InspireMusic inference job on the ZeroGPU worker.

    Loads the requested checkpoint fresh for this call (the @spaces.GPU
    decorator provides the GPU only for the duration of the call), runs
    inference with the settings built by get_args(), and returns the path
    of the generated audio file.
    """
    set_env_variables()

    model = InspireMusicUnified(
        model_name=args["model_name"],
        model_dir=args["model_dir"],
        min_generate_audio_seconds=args["min_generate_audio_seconds"],
        max_generate_audio_seconds=args["max_generate_audio_seconds"],
        sample_rate=24000,
        output_sample_rate=args["output_sample_rate"],
        load_jit=True,
        load_onnx=False,
        fast=args["fast"],
        result_dir=args["result_dir"],
    )

    # Hand every generation knob through to the model and return the
    # resulting file path directly.
    return model.inference(
        task=args["task"],
        text=args["text"],
        audio_prompt=args["audio_prompt"],
        chorus=args["chorus"],
        time_start=args["time_start"],
        time_end=args["time_end"],
        output_fn=args["output_fn"],
        max_audio_prompt_length=args["max_audio_prompt_length"],
        fade_out_duration=args["fade_out_duration"],
        output_format=args["format"],
        fade_out_mode=args["fade_out"],
        trim=args["trim"],
    )
|
134 |
+
|
135 |
+
|
136 |
+
def demo_inspiremusic_t2m(text, model_name, chorus,
                          output_sample_rate, max_generate_audio_seconds):
    """Gradio callback for the text-to-music task.

    Builds the argument dict from the UI widget values and returns the
    path of the generated audio file.
    """
    return music_generation(get_args(
        task='text-to-music', text=text, audio=None,
        model_name=model_name, chorus=chorus,
        output_sample_rate=output_sample_rate,
        max_generate_audio_seconds=max_generate_audio_seconds))
|
144 |
+
|
145 |
+
def demo_inspiremusic_con(text, audio, model_name, chorus,
                          output_sample_rate, max_generate_audio_seconds):
    """Gradio callback for the music-continuation task.

    Trims the uploaded audio prompt to its first 5 seconds, then generates
    a continuation. Raises gr.Error with a user-facing message when no
    audio prompt was uploaded — the previous version crashed inside
    torchaudio.load(None) instead.
    """
    if audio is None:
        raise gr.Error("Please upload an audio prompt for the music continuation task.")
    args = get_args(
        task='continuation', text=text, audio=trim_audio(audio, cut_seconds=5),
        model_name=model_name, chorus=chorus,
        output_sample_rate=output_sample_rate,
        max_generate_audio_seconds=max_generate_audio_seconds)
    return music_generation(args)
|
153 |
|
154 |
def main():
    # Build and serve the Gradio demo UI. All generation work happens in
    # the demo_inspiremusic_* callbacks wired to the two buttons below.
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        # Static header / usage notes shown at the top of the page.
        gr.Markdown("""
        # InspireMusic
        - Support music generation tasks with long-form and high audio quality, sampling rates up to 48kHz.
        - Github: https://github.com/FunAudioLLM/InspireMusic/ | ModelScope Studio: https://modelscope.cn/studios/iic/InspireMusic
        - Available music generation models: [InspireMusic-1.5B-Long](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long), [InspireMusic-1.5B](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B), [InspireMusic-Base](https://huggingface.co/FunAudioLLM/InspireMusic-Base), [InspireMusic-1.5B-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz), [InspireMusic-Base-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz). Both on Huggingface and ModelScope.
        - Currently only support English text prompts.
        - This page is for demo purpose, if you want to generate long-form audio, e.g., 5mins, please try to deploy locally. Thank you for your support.
        """)

        # Row 1: model/generation settings shared by both tasks.
        with gr.Row(equal_height=True):
            model_name = gr.Dropdown(
                MODELS, label="Select Model Name",
                value="InspireMusic-1.5B-Long")
            chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"],
                                 label="Chorus Mode", value="intro")
            output_sample_rate = gr.Dropdown([48000, 24000],
                                             label="Output Audio Sample Rate (Hz)",
                                             value=48000)
            max_generate_audio_seconds = gr.Slider(10, 300,
                                                   label="Generate Audio Length (s)",
                                                   value=30)

        # Row 2: task inputs and the shared output player.
        with gr.Row(equal_height=True):
            text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)",
                                    value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")

            audio_input = gr.Audio(
                label="Input Audio Prompt (For Music Continuation Task)",
                type="filepath")
            music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button = True)

        # Row 3: one button per task; both write to the same output widget.
        with gr.Row():
            button = gr.Button("Start Text-to-Music Task")
            button.click(demo_inspiremusic_t2m,
                         inputs=[text_input, model_name,
                                 chorus,
                                 output_sample_rate,
                                 max_generate_audio_seconds],
                         outputs=music_output)

            generate_button = gr.Button("Start Music Continuation Task")
            generate_button.click(demo_inspiremusic_con,
                                  inputs=[text_input, audio_input, model_name,
                                          chorus,
                                          output_sample_rate,
                                          max_generate_audio_seconds],
                                  outputs=music_output)
        # Clickable example prompts that populate the text input.
        t2m_examples = gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
    demo.launch()
|
205 |
+
|
206 |
+
if __name__ == '__main__':
    # Make sure the prompt and output directories exist before the UI
    # starts serving requests.
    for directory in (AUDIO_PROMPT_DIR, OUTPUT_AUDIO_DIR):
        os.makedirs(directory, exist_ok=True)
    main()
|