Spaces: Running on Zero

chong.zhang committed · Commit 3d0f730 · 1 Parent(s): bd63041

update

app.py CHANGED
@@ -13,197 +13,201 @@
 # limitations under the License.
 
 import os
+import sys
+import torch
+import gradio as gr
+import torchaudio
+import datetime, hashlib
+from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
 
+# Prepare environment and model files (unchanged from original)
 os.system('nvidia-smi')
 os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
 os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
-os.system(
+os.system(
+    'mkdir pretrained_models && cd pretrained_models && '
+    'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git && '
+    'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git && '
+    'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git && '
+    'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git && '
+    'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && '
+    # Fix paths in YAML files
+    'for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; '
+    'do sed -i -e "s/..\/..\///g" ${i}/inspiremusic.yaml; done && cd ..'
+)
 print(torch.backends.cudnn.version())
 ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(
-import spaces
-import gradio as gr
-from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
-import torchaudio
-import datetime
-import hashlib
-import importlib
+sys.path.append(f"{ROOT_DIR}/third_party/Matcha-TTS")
 
+# Define available model options
+MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base",
+          "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
 AUDIO_PROMPT_DIR = "demo/audio_prompts"
 OUTPUT_AUDIO_DIR = "demo/outputs"
 
+# **Initialize global model state at startup**
+loaded_model = None
+current_model_name = None
+
+# Set environment variables once (e.g., for torch performance, precision settings)
+set_env_variables()
+
+# Load the default model into GPU memory
+current_model_name = "InspireMusic-1.5B-Long"  # default selected model in the UI
+loaded_model = InspireMusicUnified(
+    model_name=current_model_name,
+    model_dir=os.path.join("pretrained_models", current_model_name),
+    min_generate_audio_seconds=10.0,
+    max_generate_audio_seconds=30.0,
+    sample_rate=24000,
+    output_sample_rate=48000,  # 48kHz output for default (non-24kHz model)
+    load_jit=True,
+    load_onnx=False,
+    fast=False,  # False because 48000 Hz output (not fast mode)
+    result_dir=OUTPUT_AUDIO_DIR
+)
+# (The model is now loaded on the GPU and ready for reuse)
 
 def generate_filename():
-    if args["time_start"] is None:
-        args["time_start"] = 0.0
-    args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"]
-    print(args)
-    return args
-
-def trim_audio(audio_file, cut_seconds=5):
-    audio, sr = torchaudio.load(audio_file)
-    num_samples = cut_seconds * sr
-    cutted_audio = audio[:, :num_samples]
-    output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
-    torchaudio.save(output_path, cutted_audio, sr)
-    return output_path
-
-@spaces.GPU(duration=120)
+    # ... (unchanged: generates a unique filename for outputs)
+    timestamp = str(int(datetime.datetime.now().timestamp())).encode()
+    hash_object = hashlib.sha256(timestamp)
+    return hash_object.hexdigest()[:10]
+
+def get_args(task, text="", audio=None, model_name="InspireMusic-Base",
+             chorus="intro", output_sample_rate=48000, max_generate_audio_seconds=30.0,
+             time_start=0.0, time_end=30.0, trim=False):
+    """Prepare the arguments dictionary for a generation task."""
+    # If a 24kHz model is selected, force output sample rate to 24000
+    if "24kHz" in model_name:
+        output_sample_rate = 24000
+    # Determine fast mode (True if using 24k output, which skips upsampling)
+    fast = True if output_sample_rate == 24000 else False
+    args = {
+        "task": task,
+        "text": text,
+        "audio_prompt": audio,
+        "model_name": model_name,
+        "chorus": chorus,
+        "fast": fast,
+        "fade_out": True,
+        "trim": trim,
+        "output_sample_rate": output_sample_rate,
+        "min_generate_audio_seconds": 10.0,
+        "max_generate_audio_seconds": max_generate_audio_seconds,
+        "max_audio_prompt_length": 5.0,
+        "model_dir": os.path.join("pretrained_models", model_name),
+        "result_dir": OUTPUT_AUDIO_DIR,
+        "output_fn": generate_filename(),
+        "format": "wav",
+        "time_start": time_start or 0.0,
+        "time_end": time_end or (time_start + max_generate_audio_seconds),
+        "fade_out_duration": 1.0,
+    }
+    return args
+
+# **Refactored inference function using the preloaded model**
+@spaces.GPU()
 def music_generation(args):
+    """Generate music using the InspireMusic model, reusing a preloaded model if available."""
+    global loaded_model, current_model_name
+    requested_model = args["model_name"]
+    # If the requested model is not the one currently loaded, load the new model
+    if loaded_model is None or requested_model != current_model_name:
+        # Free GPU memory from the old model
+        if loaded_model is not None:
+            del loaded_model
+            torch.cuda.empty_cache()  # free cached memory
+        # Load the requested model into GPU memory
+        loaded_model = InspireMusicUnified(
+            model_name=requested_model,
+            model_dir=args["model_dir"],
+            min_generate_audio_seconds=args["min_generate_audio_seconds"],
+            max_generate_audio_seconds=args["max_generate_audio_seconds"],
+            sample_rate=24000,
+            output_sample_rate=args["output_sample_rate"],
+            load_jit=True,
+            load_onnx=False,
+            fast=args["fast"],
+            result_dir=args["result_dir"]
+        )
+        current_model_name = requested_model
+    # Perform inference with the loaded model (no gradient computation needed)
+    with torch.no_grad():  # disable grad to save memory
+        output_path = loaded_model.inference(
+            task=args["task"],
+            text=args["text"],
+            audio_prompt=args["audio_prompt"],
+            chorus=args["chorus"],
+            time_start=args["time_start"],
+            time_end=args["time_end"],
+            output_fn=args["output_fn"],
+            max_audio_prompt_length=args["max_audio_prompt_length"],
+            fade_out_duration=args["fade_out_duration"],
+            output_format=args["format"],
+            fade_out_mode=args["fade_out"],
+            trim=args["trim"]
+        )
+    return output_path
+
+# Demo helper functions (using music_generation internally)
+def demo_inspiremusic_t2m(text, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
+    args = get_args(task="text-to-music", text=text, audio=None,
+                    model_name=model_name, chorus=chorus,
+                    output_sample_rate=output_sample_rate,
+                    max_generate_audio_seconds=max_generate_audio_seconds)
+    return music_generation(args)
+
+def demo_inspiremusic_con(text, audio, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
+    # Trim the audio prompt to 5 seconds and use it for continuation
+    trimmed_audio = trim_audio(audio, cut_seconds=5)
+    args = get_args(task="continuation", text=text, audio=trimmed_audio,
+                    model_name=model_name, chorus=chorus,
+                    output_sample_rate=output_sample_rate,
+                    max_generate_audio_seconds=max_generate_audio_seconds)
+    return music_generation(args)
 
-def demo_inspiremusic_con(text, audio, model_name, chorus,
-                          output_sample_rate, max_generate_audio_seconds):
-    args = get_args(
-        task='continuation', text=text, audio=trim_audio(audio, cut_seconds=5),
-        model_name=model_name, chorus=chorus,
-        output_sample_rate=output_sample_rate,
-        max_generate_audio_seconds=max_generate_audio_seconds)
-    return music_generation(args)
+def trim_audio(audio_file, cut_seconds=5):
+    # ... (unchanged: load audio and trim to first 5 seconds)
+    audio_tensor, sr = torchaudio.load(audio_file)
+    num_samples = int(cut_seconds * sr)
+    trimmed_audio = audio_tensor[:, :num_samples]
+    output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
+    torchaudio.save(output_path, trimmed_audio, sr)
+    return output_path
 
 def main():
-        button.click(demo_inspiremusic_t2m,
-                     inputs=[text_input, model_name,
-                             chorus,
-                             output_sample_rate,
-                             max_generate_audio_seconds],
-                     outputs=music_output)
-
-        generate_button = gr.Button("Start Music Continuation Task")
-        generate_button.click(demo_inspiremusic_con,
-                              inputs=[text_input, audio_input, model_name,
-                                      chorus,
-                                      output_sample_rate,
-                                      max_generate_audio_seconds],
-                              outputs=music_output)
-        t2m_examples = gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
-    demo.launch()
-
-if __name__ == '__main__':
-    os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
-    os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
-    main()
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# InspireMusic\n"
+                    "- A demo for music generation with high audio quality (up to 48kHz) and long-form capabilities.\n"
+                    "- GitHub: https://github.com/FunAudioLLM/InspireMusic\n"
+                    "- Available models: InspireMusic-1.5B-Long, InspireMusic-1.5B, InspireMusic-Base, InspireMusic-1.5B-24kHz, InspireMusic-Base-24kHz (on Hugging Face and ModelScope).\n"
+                    "*(Note: Only English text prompts are supported.)*")
+        # Input components
+        model_name = gr.Dropdown(MODELS, label="Select Model Name", value="InspireMusic-1.5B-Long")
+        chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"], label="Chorus Mode", value="intro")
+        output_sample_rate = gr.Dropdown([48000, 24000], label="Output Audio Sample Rate (Hz)", value=48000)
+        max_generate_audio_seconds = gr.Slider(10, 300, label="Generate Audio Length (s)", value=30)
+        with gr.Row():
+            text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)", value="Experience soothing ... ambiance.")
+            audio_input = gr.Audio(label="Input Audio Prompt (For Music Continuation Task)", type="filepath")
+        music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button=True)
+        # Buttons to trigger generation
+        with gr.Row():
+            t2m_button = gr.Button("Start Text-to-Music Task")
+            con_button = gr.Button("Start Music Continuation Task")
+        # Bind button clicks to the respective functions
+        t2m_button.click(fn=demo_inspiremusic_t2m,
+                         inputs=[text_input, model_name, chorus, output_sample_rate, max_generate_audio_seconds],
+                         outputs=music_output)
+        con_button.click(fn=demo_inspiremusic_con,
+                         inputs=[text_input, audio_input, model_name, chorus, output_sample_rate, max_generate_audio_seconds],
+                         outputs=music_output)
+        gr.Examples(examples=[...], inputs=[text_input])  # (example prompts list truncated for brevity)
+    demo.launch()
+
+if __name__ == "__main__":
+    # Ensure output directories exist
+    os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
+    os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
+    main()
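
The heart of this update is a model-caching pattern: load an InspireMusicUnified checkpoint once into a module-level global and reuse it inside the @spaces.GPU()-decorated function, reloading only when the user selects a different model. The minimal standalone sketch below illustrates that pattern; it is not the app code itself, load_model is a hypothetical stand-in for the InspireMusicUnified(...) constructor call in the diff, and it assumes `import spaces` is available (the decorator requires it, even though the import does not appear in this hunk).

import torch
import spaces  # ZeroGPU decorator used by Hugging Face Spaces

_loaded_model = None   # model instance kept alive across requests
_current_name = None   # name of the checkpoint currently in memory

def load_model(name):
    """Hypothetical stand-in for InspireMusicUnified(model_name=name, ...)."""
    raise NotImplementedError

@spaces.GPU()
def generate(model_name, args):
    """Reuse the cached model; swap it out only when a different one is requested."""
    global _loaded_model, _current_name
    if _loaded_model is None or model_name != _current_name:
        if _loaded_model is not None:
            del _loaded_model             # drop the old model first
            torch.cuda.empty_cache()      # release cached GPU memory
        _loaded_model = load_model(model_name)
        _current_name = model_name
    with torch.no_grad():                 # inference only, no gradients needed
        return _loaded_model.inference(**args)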