chong.zhang committed
Commit
3d0f730
·
1 Parent(s): bd63041
Files changed (1)
  1. app.py +182 -178
app.py CHANGED
@@ -13,197 +13,201 @@
  # limitations under the License.

  import os

  os.system('nvidia-smi')
  os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
  os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
- os.system('mkdir pretrained_models && cd pretrained_models && git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git &&git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')
-
- import sys
- import torch
  print(torch.backends.cudnn.version())
-
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
- sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
-
- import spaces
- import gradio as gr
- from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
- import torchaudio
- import datetime
- import hashlib
- import importlib

- MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base", "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
  AUDIO_PROMPT_DIR = "demo/audio_prompts"
  OUTPUT_AUDIO_DIR = "demo/outputs"

- DEMO_TEXT_PROMPTS = ["Jazz music with drum beats.",
-     "A captivating classical piano performance, this piece exudes a dynamic and intense atmosphere, showcasing intricate and expressive instrumental artistry.",
-     "A soothing instrumental piece blending elements of light music and pop, featuring a gentle guitar rendition. The overall feel is serene and reflective, likely instrumental with no vocals.",
-     "The instrumental rock piece features dynamic oscillations and wave-like progressions, creating an immersive and energetic atmosphere. The music is purely instrumental, with no vocals, and it blends elements of rock and post-rock for a powerful and evocative experience.",
-     "The classical instrumental piece exudes a haunting and evocative atmosphere, characterized by its intricate guitar work and profound emotional depth.",
-     "Experience a dynamic blend of instrumental electronic music with futuristic house vibes, featuring energetic beats and a captivating rhythm. The tracks are likely instrumental, focusing on the immersive soundscapes rather than vocal performances."]

  def generate_filename():
-     hash_object = hashlib.sha256(str(int(datetime.datetime.now().timestamp())).encode())
-     hash_string = hash_object.hexdigest()
-     return hash_string
-
- def get_args(
-     task, text="", audio=None, model_name="InspireMusic-Base",
-     chorus="intro",
-     output_sample_rate=48000, max_generate_audio_seconds=30.0, time_start = 0.0, time_end=30.0, trim=False):
-
-     if "24kHz" in model_name:
-         output_sample_rate = 24000
-
-     if output_sample_rate == 24000:
-         fast = True
-     else:
-         fast = False
-     # This function constructs the arguments required for InspireMusic
-     args = {
-         "task" : task,
-         "text" : text,
-         "audio_prompt" : audio,
-         "model_name" : model_name,
-         "chorus" : chorus,
-         "fast" : fast,
-         "fade_out" : True,
-         "trim" : trim,
-         "output_sample_rate" : output_sample_rate,
-         "min_generate_audio_seconds": 10.0,
-         "max_generate_audio_seconds": max_generate_audio_seconds,
-         "max_audio_prompt_length": 5.0,
-         "model_dir" : os.path.join("pretrained_models",
-                                    model_name),
-         "result_dir" : OUTPUT_AUDIO_DIR,
-         "output_fn" : generate_filename(),
-         "format" : "wav",
-         "time_start" : time_start,
-         "time_end": time_end,
-         "fade_out_duration": 1.0,
-     }
-
-     if args["time_start"] is None:
-         args["time_start"] = 0.0
-         args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"]
-
-     print(args)
-     return args
-
-
- def trim_audio(audio_file, cut_seconds=5):
-     audio, sr = torchaudio.load(audio_file)
-     num_samples = cut_seconds * sr
-     cutted_audio = audio[:, :num_samples]
-     output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
-     torchaudio.save(output_path, cutted_audio, sr)
-     return output_path
-
- @spaces.GPU(duration=120)
  def music_generation(args):
-     set_env_variables()
-     model = InspireMusicUnified(
-         model_name=args["model_name"],
-         model_dir=args["model_dir"],
-         min_generate_audio_seconds=args["min_generate_audio_seconds"],
-         max_generate_audio_seconds=args["max_generate_audio_seconds"],
-         sample_rate=24000,
-         output_sample_rate=args["output_sample_rate"],
-         load_jit=True,
-         load_onnx=False,
-         fast=args["fast"],
-         result_dir=args["result_dir"])
-
-     output_path = model.inference(
-         task=args["task"],
-         text=args["text"],
-         audio_prompt=args["audio_prompt"],
-         chorus=args["chorus"],
-         time_start=args["time_start"],
-         time_end=args["time_end"],
-         output_fn=args["output_fn"],
-         max_audio_prompt_length=args["max_audio_prompt_length"],
-         fade_out_duration=args["fade_out_duration"],
-         output_format=args["format"],
-         fade_out_mode=args["fade_out"],
-         trim=args["trim"])
-     return output_path
-

- def demo_inspiremusic_t2m(text, model_name, chorus,
-                           output_sample_rate, max_generate_audio_seconds):
-     args = get_args(
-         task='text-to-music', text=text, audio=None,
-         model_name=model_name, chorus=chorus,
-         output_sample_rate=output_sample_rate,
-         max_generate_audio_seconds=max_generate_audio_seconds)
-     return music_generation(args)
-
- def demo_inspiremusic_con(text, audio, model_name, chorus,
-                           output_sample_rate, max_generate_audio_seconds):
-     args = get_args(
-         task='continuation', text=text, audio=trim_audio(audio, cut_seconds=5),
-         model_name=model_name, chorus=chorus,
-         output_sample_rate=output_sample_rate,
-         max_generate_audio_seconds=max_generate_audio_seconds)
-     return music_generation(args)

  def main():
-     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown("""
-         # InspireMusic
-         - Support music generation tasks with long-form and high audio quality, sampling rates up to 48kHz.
-         - Github: https://github.com/FunAudioLLM/InspireMusic/ | ModelScope Studio: https://modelscope.cn/studios/iic/InspireMusic
-         - Available music generation models: [InspireMusic-1.5B-Long](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long), [InspireMusic-1.5B](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B), [InspireMusic-Base](https://huggingface.co/FunAudioLLM/InspireMusic-Base), [InspireMusic-1.5B-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz), [InspireMusic-Base-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz). Both on Huggingface and ModelScope.
-         - Currently only support English text prompts.
-         - This page is for demo purpose, if you want to generate long-form audio, e.g., 5mins, please try to deploy locally. Thank you for your support.
-         """)
-
-         with gr.Row(equal_height=True):
-             model_name = gr.Dropdown(
-                 MODELS, label="Select Model Name",
-                 value="InspireMusic-1.5B-Long")
-             chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"],
-                                  label="Chorus Mode", value="intro")
-             output_sample_rate = gr.Dropdown([48000, 24000],
-                                              label="Output Audio Sample Rate (Hz)",
-                                              value=48000)
-             max_generate_audio_seconds = gr.Slider(10, 300,
-                                                    label="Generate Audio Length (s)",
-                                                    value=30)
-
-         with gr.Row(equal_height=True):
-             text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)",
-                                     value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
-
-             audio_input = gr.Audio(
-                 label="Input Audio Prompt (For Music Continuation Task)",
-                 type="filepath")
-             music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button = True)
-
-         with gr.Row():
-             button = gr.Button("Start Text-to-Music Task")
-             button.click(demo_inspiremusic_t2m,
-                          inputs=[text_input, model_name,
-                                  chorus,
-                                  output_sample_rate,
-                                  max_generate_audio_seconds],
-                          outputs=music_output)
-
-             generate_button = gr.Button("Start Music Continuation Task")
-             generate_button.click(demo_inspiremusic_con,
-                                   inputs=[text_input, audio_input, model_name,
-                                           chorus,
-                                           output_sample_rate,
-                                           max_generate_audio_seconds],
-                                   outputs=music_output)
-         t2m_examples = gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
-         demo.launch()
-
- if __name__ == '__main__':
-     os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
-     os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
-     main()
  # limitations under the License.

  import os
+ import sys
+ import torch
+ import spaces  # required for the @spaces.GPU decorator used below
+ import gradio as gr
+ import torchaudio
+ import datetime, hashlib
+ from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables

+ # Prepare environment and model files
  os.system('nvidia-smi')
  os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
  os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
+ os.system(
+     'mkdir pretrained_models && cd pretrained_models && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && '
+     # Fix relative paths in each model's YAML config
+     'for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; '
+     'do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..'
+ )
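+ # Note on the sed call above: it deletes literal "../../" prefixes from each
+ # model's inspiremusic.yaml, presumably so the referenced files resolve from
+ # the Space's working directory (an inference from the command's intent).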
  print(torch.backends.cudnn.version())
 
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append(f"{ROOT_DIR}/third_party/Matcha-TTS")

+ # Define available model options
+ MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base",
+           "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
  AUDIO_PROMPT_DIR = "demo/audio_prompts"
  OUTPUT_AUDIO_DIR = "demo/outputs"
+
+ # Demo text prompts offered as clickable examples in the UI
+ DEMO_TEXT_PROMPTS = ["Jazz music with drum beats.",
+     "A captivating classical piano performance, this piece exudes a dynamic and intense atmosphere, showcasing intricate and expressive instrumental artistry.",
+     "A soothing instrumental piece blending elements of light music and pop, featuring a gentle guitar rendition. The overall feel is serene and reflective, likely instrumental with no vocals.",
+     "The instrumental rock piece features dynamic oscillations and wave-like progressions, creating an immersive and energetic atmosphere. The music is purely instrumental, with no vocals, and it blends elements of rock and post-rock for a powerful and evocative experience.",
+     "The classical instrumental piece exudes a haunting and evocative atmosphere, characterized by its intricate guitar work and profound emotional depth.",
+     "Experience a dynamic blend of instrumental electronic music with futuristic house vibes, featuring energetic beats and a captivating rhythm. The tracks are likely instrumental, focusing on the immersive soundscapes rather than vocal performances."]

+ # Initialize global model state at startup
+ loaded_model = None
+ current_model_name = None
+ current_model_config = None  # (output_sample_rate, max_generate_audio_seconds, fast)
+
+ # Set environment variables once (e.g., for torch performance, precision settings)
+ set_env_variables()
+
+ # Preload the default model so the first request skips the load cost
+ current_model_name = "InspireMusic-1.5B-Long"  # default selected model in the UI
+ current_model_config = (48000, 30.0, False)    # matches the UI defaults below
+ loaded_model = InspireMusicUnified(
+     model_name=current_model_name,
+     model_dir=os.path.join("pretrained_models", current_model_name),
+     min_generate_audio_seconds=10.0,
+     max_generate_audio_seconds=30.0,
+     sample_rate=24000,
+     output_sample_rate=48000,  # 48kHz output for the default (non-24kHz) model
+     load_jit=True,
+     load_onnx=False,
+     fast=False,  # fast mode is only used for 24kHz output
+     result_dir=OUTPUT_AUDIO_DIR
+ )
+ # The model is now loaded and ready for reuse across requests
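+ # Note (assumption about the runtime): if this Space runs on ZeroGPU, CUDA is
+ # only available inside @spaces.GPU-decorated calls, so this startup load may
+ # keep the weights on CPU until the first generation request runs on the GPU.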
 
  def generate_filename():
+     # Derive a short, unique output filename from the current timestamp
+     timestamp = str(int(datetime.datetime.now().timestamp())).encode()
+     hash_object = hashlib.sha256(timestamp)
+     return hash_object.hexdigest()[:10]
+
+ def get_args(task, text="", audio=None, model_name="InspireMusic-Base",
+              chorus="intro", output_sample_rate=48000, max_generate_audio_seconds=30.0,
+              time_start=0.0, time_end=30.0, trim=False):
+     """Prepare the argument dictionary for a generation task."""
+     # If a 24kHz model is selected, force the output sample rate to 24000
+     if "24kHz" in model_name:
+         output_sample_rate = 24000
+     # Fast mode is used for 24kHz output, which skips upsampling
+     fast = (output_sample_rate == 24000)
+     # Guard against missing time bounds before building the dict
+     time_start = time_start or 0.0
+     time_end = time_end or (time_start + max_generate_audio_seconds)
+     args = {
+         "task": task,
+         "text": text,
+         "audio_prompt": audio,
+         "model_name": model_name,
+         "chorus": chorus,
+         "fast": fast,
+         "fade_out": True,
+         "trim": trim,
+         "output_sample_rate": output_sample_rate,
+         "min_generate_audio_seconds": 10.0,
+         "max_generate_audio_seconds": max_generate_audio_seconds,
+         "max_audio_prompt_length": 5.0,
+         "model_dir": os.path.join("pretrained_models", model_name),
+         "result_dir": OUTPUT_AUDIO_DIR,
+         "output_fn": generate_filename(),
+         "format": "wav",
+         "time_start": time_start,
+         "time_end": time_end,
+         "fade_out_duration": 1.0,
+     }
+     return args
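+ # Usage sketch (hypothetical call, not executed): selecting a 24kHz model
+ # forces both derived settings, regardless of the sample rate passed in:
+ #   a = get_args(task="text-to-music", text="Jazz music with drum beats.",
+ #                model_name="InspireMusic-1.5B-24kHz")
+ #   a["output_sample_rate"]  # -> 24000
+ #   a["fast"]                # -> True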
+
+ # Refactored inference function that reuses the preloaded model
+ @spaces.GPU()
  def music_generation(args):
+     """Generate music with InspireMusic, reusing the preloaded model when possible."""
+     global loaded_model, current_model_name, current_model_config
+     requested_model = args["model_name"]
+     requested_config = (args["output_sample_rate"],
+                         args["max_generate_audio_seconds"], args["fast"])
+     # Reload when the requested model or its generation settings differ from
+     # what is currently in memory
+     if (loaded_model is None or requested_model != current_model_name
+             or requested_config != current_model_config):
+         if loaded_model is not None:
+             # Free GPU memory held by the old model
+             del loaded_model
+             torch.cuda.empty_cache()
+         # Load the requested model
+         loaded_model = InspireMusicUnified(
+             model_name=requested_model,
+             model_dir=args["model_dir"],
+             min_generate_audio_seconds=args["min_generate_audio_seconds"],
+             max_generate_audio_seconds=args["max_generate_audio_seconds"],
+             sample_rate=24000,
+             output_sample_rate=args["output_sample_rate"],
+             load_jit=True,
+             load_onnx=False,
+             fast=args["fast"],
+             result_dir=args["result_dir"]
+         )
+         current_model_name = requested_model
+         current_model_config = requested_config
+     # Run inference with the loaded model; no gradients are needed
+     with torch.no_grad():
+         output_path = loaded_model.inference(
+             task=args["task"],
+             text=args["text"],
+             audio_prompt=args["audio_prompt"],
+             chorus=args["chorus"],
+             time_start=args["time_start"],
+             time_end=args["time_end"],
+             output_fn=args["output_fn"],
+             max_audio_prompt_length=args["max_audio_prompt_length"],
+             fade_out_duration=args["fade_out_duration"],
+             output_format=args["format"],
+             fade_out_mode=args["fade_out"],
+             trim=args["trim"]
+         )
+     return output_path
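+ # Design note: `del` drops the last reference to the old model so its memory
+ # can be reclaimed; torch.cuda.empty_cache() then releases cached blocks back
+ # to the driver before the replacement model is constructed.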
+
+ # Demo helper functions (wrappers that build args and run generation)
+ def demo_inspiremusic_t2m(text, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
+     args = get_args(task="text-to-music", text=text, audio=None,
+                     model_name=model_name, chorus=chorus,
+                     output_sample_rate=output_sample_rate,
+                     max_generate_audio_seconds=max_generate_audio_seconds)
+     return music_generation(args)
+
+ def demo_inspiremusic_con(text, audio, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
+     # Trim the audio prompt to 5 seconds and use it for continuation
+     trimmed_audio = trim_audio(audio, cut_seconds=5)
+     args = get_args(task="continuation", text=text, audio=trimmed_audio,
+                     model_name=model_name, chorus=chorus,
+                     output_sample_rate=output_sample_rate,
+                     max_generate_audio_seconds=max_generate_audio_seconds)
+     return music_generation(args)

+ def trim_audio(audio_file, cut_seconds=5):
+     # Load the prompt audio and keep only the first `cut_seconds` seconds
+     audio_tensor, sr = torchaudio.load(audio_file)
+     num_samples = int(cut_seconds * sr)
+     trimmed_audio = audio_tensor[:, :num_samples]
+     output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
+     torchaudio.save(output_path, trimmed_audio, sr)
+     return output_path
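+ # Usage sketch (hypothetical input path, not executed):
+ #   trim_audio("some_prompt.wav", cut_seconds=5)
+ # writes a 5-second excerpt under demo/audio_prompts/ and returns its path.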

  def main():
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# InspireMusic\n"
+                     "- A demo for music generation with high audio quality (up to 48kHz) and long-form capability.\n"
+                     "- GitHub: https://github.com/FunAudioLLM/InspireMusic\n"
+                     "- Available models: InspireMusic-1.5B-Long, InspireMusic-1.5B, InspireMusic-Base, InspireMusic-1.5B-24kHz, InspireMusic-Base-24kHz (on Hugging Face and ModelScope).\n"
+                     "*(Note: only English text prompts are supported.)*")
+         # Input components
+         model_name = gr.Dropdown(MODELS, label="Select Model Name", value="InspireMusic-1.5B-Long")
+         chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"], label="Chorus Mode", value="intro")
+         output_sample_rate = gr.Dropdown([48000, 24000], label="Output Audio Sample Rate (Hz)", value=48000)
+         max_generate_audio_seconds = gr.Slider(10, 300, label="Generate Audio Length (s)", value=30)
+         with gr.Row():
+             text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)",
+                                     value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
+             audio_input = gr.Audio(label="Input Audio Prompt (For Music Continuation Task)", type="filepath")
+         music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button=True)
+         # Buttons to trigger generation
+         with gr.Row():
+             t2m_button = gr.Button("Start Text-to-Music Task")
+             con_button = gr.Button("Start Music Continuation Task")
+         # Bind button clicks to the respective functions
+         t2m_button.click(fn=demo_inspiremusic_t2m,
+                          inputs=[text_input, model_name, chorus, output_sample_rate, max_generate_audio_seconds],
+                          outputs=music_output)
+         con_button.click(fn=demo_inspiremusic_con,
+                          inputs=[text_input, audio_input, model_name, chorus, output_sample_rate, max_generate_audio_seconds],
+                          outputs=music_output)
+         gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
+     demo.launch()
+
+ if __name__ == "__main__":
+     # Ensure output directories exist
+     os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
+     os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
+     main()