chong.zhang committed
Commit c72a6e3 · Parent: 3d0f730
Files changed (1)
  1. app.py +178 -182
app.py CHANGED
@@ -13,201 +13,197 @@
  # limitations under the License.
 
  import os
- import sys
- import torch
- import spaces
- import gradio as gr
- import torchaudio
- import datetime, hashlib
- from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
 
- # Prepare environment and model files (unchanged from original)
  os.system('nvidia-smi')
  os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
  os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
- os.system(
-     'mkdir pretrained_models && cd pretrained_models && '
-     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git && '
-     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git && '
-     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git && '
-     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git && '
-     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && '
-     # Fix relative paths in the cloned YAML configs
-     'for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; '
-     'do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..'
- )
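
The sed loop above strips the "../../" prefixes from each cloned model's inspiremusic.yaml so its paths resolve relative to the model directory. A pure-Python sketch of the same rewrite, for clarity only (the shell one-liner is what actually runs):

import os

for name in ["InspireMusic-Base", "InspireMusic-Base-24kHz", "InspireMusic-1.5B",
             "InspireMusic-1.5B-24kHz", "InspireMusic-1.5B-Long"]:
    yaml_path = os.path.join("pretrained_models", name, "inspiremusic.yaml")
    with open(yaml_path) as f:
        text = f.read()
    # Same effect as: sed -i -e "s/\.\.\/\.\.\///g" <model>/inspiremusic.yaml
    with open(yaml_path, "w") as f:
        f.write(text.replace("../../", ""))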
  print(torch.backends.cudnn.version())
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
- sys.path.append(f"{ROOT_DIR}/third_party/Matcha-TTS")
 
- # Define available model options
- MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base",
-           "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
  AUDIO_PROMPT_DIR = "demo/audio_prompts"
  OUTPUT_AUDIO_DIR = "demo/outputs"
 
- # Initialize global model state at startup
- loaded_model = None
- current_model_name = None
- 
- # Set environment variables once (e.g., for torch performance, precision settings)
- set_env_variables()
- 
- # Load the default model into GPU memory
- current_model_name = "InspireMusic-1.5B-Long"  # default selected model in the UI
- loaded_model = InspireMusicUnified(
-     model_name=current_model_name,
-     model_dir=os.path.join("pretrained_models", current_model_name),
-     min_generate_audio_seconds=10.0,
-     max_generate_audio_seconds=30.0,
-     sample_rate=24000,
-     output_sample_rate=48000,  # 48 kHz output for the default (non-24kHz) model
-     load_jit=True,
-     load_onnx=False,
-     fast=False,  # False because output is 48000 Hz (not fast mode)
-     result_dir=OUTPUT_AUDIO_DIR
- )
- # (The model is now loaded on the GPU and ready for reuse)
 
  def generate_filename():
-     # (unchanged: generates a unique filename for outputs)
-     timestamp = str(int(datetime.datetime.now().timestamp())).encode()
-     hash_object = hashlib.sha256(timestamp)
-     return hash_object.hexdigest()[:10]
- 
- def get_args(task, text="", audio=None, model_name="InspireMusic-Base",
-              chorus="intro", output_sample_rate=48000, max_generate_audio_seconds=30.0,
-              time_start=0.0, time_end=30.0, trim=False):
-     """Prepare the arguments dictionary for a generation task."""
-     # If a 24kHz model is selected, force the output sample rate to 24000
-     if "24kHz" in model_name:
-         output_sample_rate = 24000
-     # Fast mode applies to 24 kHz output, which skips upsampling
-     fast = output_sample_rate == 24000
-     args = {
-         "task": task,
-         "text": text,
-         "audio_prompt": audio,
-         "model_name": model_name,
-         "chorus": chorus,
-         "fast": fast,
-         "fade_out": True,
-         "trim": trim,
-         "output_sample_rate": output_sample_rate,
-         "min_generate_audio_seconds": 10.0,
-         "max_generate_audio_seconds": max_generate_audio_seconds,
-         "max_audio_prompt_length": 5.0,
-         "model_dir": os.path.join("pretrained_models", model_name),
-         "result_dir": OUTPUT_AUDIO_DIR,
-         "output_fn": generate_filename(),
-         "format": "wav",
-         "time_start": time_start or 0.0,
-         "time_end": time_end or (time_start + max_generate_audio_seconds),
-         "fade_out_duration": 1.0,
-     }
-     return args
- 
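A quick illustration of the sample-rate override in get_args: selecting a 24kHz model wins over whatever output rate the UI requested, and fast mode follows from it.

# Illustration only, not part of the app code
args = get_args(task="text-to-music", text="Jazz music with drum beats.",
                model_name="InspireMusic-1.5B-24kHz", output_sample_rate=48000)
assert args["output_sample_rate"] == 24000
assert args["fast"] is True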
- # Refactored inference function using the preloaded model
- @spaces.GPU()
- def music_generation(args):
-     """Generate music using the InspireMusic model, reusing a preloaded model if available."""
-     global loaded_model, current_model_name
-     requested_model = args["model_name"]
-     # If the requested model is not the one currently loaded, load the new model
-     if loaded_model is None or requested_model != current_model_name:
-         # Free GPU memory from the old model
-         if loaded_model is not None:
-             del loaded_model
-             torch.cuda.empty_cache()  # free cached memory
-         # Load the requested model into GPU memory
-         loaded_model = InspireMusicUnified(
-             model_name=requested_model,
-             model_dir=args["model_dir"],
-             min_generate_audio_seconds=args["min_generate_audio_seconds"],
-             max_generate_audio_seconds=args["max_generate_audio_seconds"],
-             sample_rate=24000,
-             output_sample_rate=args["output_sample_rate"],
-             load_jit=True,
-             load_onnx=False,
-             fast=args["fast"],
-             result_dir=args["result_dir"]
-         )
-         current_model_name = requested_model
-     # Perform inference with the loaded model (no gradient computation needed)
-     with torch.no_grad():  # disable grad to save memory
-         output_path = loaded_model.inference(
-             task=args["task"],
-             text=args["text"],
-             audio_prompt=args["audio_prompt"],
-             chorus=args["chorus"],
-             time_start=args["time_start"],
-             time_end=args["time_end"],
-             output_fn=args["output_fn"],
-             max_audio_prompt_length=args["max_audio_prompt_length"],
-             fade_out_duration=args["fade_out_duration"],
-             output_format=args["format"],
-             fade_out_mode=args["fade_out"],
-             trim=args["trim"]
-         )
-     return output_path
- 
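One caveat with the global-model pattern above, offered as a hedged aside rather than something either version of the file does: Gradio can dispatch overlapping requests, and the swap of loaded_model is not thread-safe. A minimal sketch of a guard, assuming a single-process server:

import threading

_model_lock = threading.Lock()

def music_generation_locked(args):
    # Serialize requests so one cannot swap the model mid-inference of another
    with _model_lock:
        return music_generation(args)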
- # Demo helper functions (using music_generation internally)
- def demo_inspiremusic_t2m(text, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
-     args = get_args(task="text-to-music", text=text, audio=None,
-                     model_name=model_name, chorus=chorus,
-                     output_sample_rate=output_sample_rate,
-                     max_generate_audio_seconds=max_generate_audio_seconds)
-     return music_generation(args)
- 
- def demo_inspiremusic_con(text, audio, model_name, chorus, output_sample_rate, max_generate_audio_seconds):
-     # Trim the audio prompt to 5 seconds and use it for continuation
-     trimmed_audio = trim_audio(audio, cut_seconds=5)
-     args = get_args(task="continuation", text=text, audio=trimmed_audio,
-                     model_name=model_name, chorus=chorus,
-                     output_sample_rate=output_sample_rate,
-                     max_generate_audio_seconds=max_generate_audio_seconds)
-     return music_generation(args)
 
  def trim_audio(audio_file, cut_seconds=5):
-     # (unchanged: load audio and trim to the first 5 seconds)
-     audio_tensor, sr = torchaudio.load(audio_file)
-     num_samples = int(cut_seconds * sr)
-     trimmed_audio = audio_tensor[:, :num_samples]
-     output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
-     torchaudio.save(output_path, trimmed_audio, sr)
-     return output_path
 
  def main():
-     with gr.Blocks(theme=gr.themes.Soft()) as demo:
-         gr.Markdown("# InspireMusic\n"
-                     "- A demo for music generation with high audio quality (up to 48kHz) and long-form capabilities.\n"
-                     "- GitHub: https://github.com/FunAudioLLM/InspireMusic\n"
-                     "- Available models: InspireMusic-1.5B-Long, InspireMusic-1.5B, InspireMusic-Base, InspireMusic-1.5B-24kHz, InspireMusic-Base-24kHz (on Hugging Face and ModelScope).\n"
-                     "*(Note: Only English text prompts are supported.)*")
-         # Input components
-         model_name = gr.Dropdown(MODELS, label="Select Model Name", value="InspireMusic-1.5B-Long")
-         chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"], label="Chorus Mode", value="intro")
-         output_sample_rate = gr.Dropdown([48000, 24000], label="Output Audio Sample Rate (Hz)", value=48000)
-         max_generate_audio_seconds = gr.Slider(10, 300, label="Generate Audio Length (s)", value=30)
-         with gr.Row():
-             text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)", value="Experience soothing ... ambiance.")
-             audio_input = gr.Audio(label="Input Audio Prompt (For Music Continuation Task)", type="filepath")
-         music_output = gr.Audio(label="Generated Music", type="filepath", autoplay=True, show_download_button=True)
-         # Buttons to trigger generation
-         with gr.Row():
-             t2m_button = gr.Button("Start Text-to-Music Task")
-             con_button = gr.Button("Start Music Continuation Task")
-         # Bind button clicks to the respective functions
-         t2m_button.click(fn=demo_inspiremusic_t2m,
-                          inputs=[text_input, model_name, chorus, output_sample_rate, max_generate_audio_seconds],
-                          outputs=music_output)
-         con_button.click(fn=demo_inspiremusic_con,
-                          inputs=[text_input, audio_input, model_name, chorus, output_sample_rate, max_generate_audio_seconds],
-                          outputs=music_output)
-         gr.Examples(examples=[...], inputs=[text_input])  # (example prompts list truncated for brevity)
-     demo.launch()
- 
- if __name__ == "__main__":
-     # Ensure output directories exist
-     os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
-     os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
-     main()
 
  # limitations under the License.
 
  import os
  os.system('nvidia-smi')
  os.system('apt update -y && apt-get install -y apt-utils && apt install -y unzip')
  os.environ['PYTHONPATH'] = 'third_party/Matcha-TTS'
+ os.system(
+     'mkdir pretrained_models && cd pretrained_models && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz.git && '
+     'git clone https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz.git && '
+     'for i in InspireMusic-Base InspireMusic-Base-24kHz InspireMusic-1.5B InspireMusic-1.5B-24kHz InspireMusic-1.5B-Long; '
+     'do sed -i -e "s/\.\.\/\.\.\///g" ${i}/inspiremusic.yaml; done && cd ..')
+ 
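As an aside, the shell one-liner re-clones everything on a cold start and stops partway if any clone fails. A sketch of an alternative using huggingface_hub (an assumption; this file does not use it), which skips files already on disk:

import os
from huggingface_hub import snapshot_download

for name in ["InspireMusic-Base", "InspireMusic-1.5B-Long", "InspireMusic-1.5B",
             "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]:
    # Downloads (or reuses) the repo contents under pretrained_models/<name>
    snapshot_download(repo_id="FunAudioLLM/" + name,
                      local_dir=os.path.join("pretrained_models", name))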
+ import sys
+ import torch
  print(torch.backends.cudnn.version())
+ 
  ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
+ 
+ import spaces
+ import gradio as gr
+ from inspiremusic.cli.inference import InspireMusicUnified, set_env_variables
+ import torchaudio
+ import datetime
+ import hashlib
+ import importlib
 
+ MODELS = ["InspireMusic-1.5B-Long", "InspireMusic-1.5B", "InspireMusic-Base",
+           "InspireMusic-1.5B-24kHz", "InspireMusic-Base-24kHz"]
 
  AUDIO_PROMPT_DIR = "demo/audio_prompts"
  OUTPUT_AUDIO_DIR = "demo/outputs"
 
+ DEMO_TEXT_PROMPTS = [
+     "Jazz music with drum beats.",
+     "A captivating classical piano performance, this piece exudes a dynamic and intense atmosphere, showcasing intricate and expressive instrumental artistry.",
+     "A soothing instrumental piece blending elements of light music and pop, featuring a gentle guitar rendition. The overall feel is serene and reflective, likely instrumental with no vocals.",
+     "The instrumental rock piece features dynamic oscillations and wave-like progressions, creating an immersive and energetic atmosphere. The music is purely instrumental, with no vocals, and it blends elements of rock and post-rock for a powerful and evocative experience.",
+     "The classical instrumental piece exudes a haunting and evocative atmosphere, characterized by its intricate guitar work and profound emotional depth.",
+     "Experience a dynamic blend of instrumental electronic music with futuristic house vibes, featuring energetic beats and a captivating rhythm. The tracks are likely instrumental, focusing on the immersive soundscapes rather than vocal performances.",
+ ]
 
  def generate_filename():
+     hash_object = hashlib.sha256(str(int(datetime.datetime.now().timestamp())).encode())
+     hash_string = hash_object.hexdigest()
+     return hash_string
+ 
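Worth noting: the filename is a hash of a second-resolution timestamp, so two requests landing in the same second collide. A collision-resistant sketch (a hypothetical replacement, not what this commit does):

import uuid

def generate_filename():
    # uuid4 is random per call, so same-second requests no longer collide
    return uuid.uuid4().hex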
+ def get_args(
+         task, text="", audio=None, model_name="InspireMusic-Base",
+         chorus="intro", output_sample_rate=48000,
+         max_generate_audio_seconds=30.0, time_start=0.0, time_end=30.0,
+         trim=False):
+ 
+     if "24kHz" in model_name:
+         output_sample_rate = 24000
+ 
+     # Fast mode skips upsampling and applies only to 24 kHz output
+     fast = output_sample_rate == 24000
+ 
+     # This function constructs the arguments required by InspireMusic
+     args = {
+         "task": task,
+         "text": text,
+         "audio_prompt": audio,
+         "model_name": model_name,
+         "chorus": chorus,
+         "fast": fast,
+         "fade_out": True,
+         "trim": trim,
+         "output_sample_rate": output_sample_rate,
+         "min_generate_audio_seconds": 10.0,
+         "max_generate_audio_seconds": max_generate_audio_seconds,
+         "max_audio_prompt_length": 5.0,
+         "model_dir": os.path.join("pretrained_models", model_name),
+         "result_dir": OUTPUT_AUDIO_DIR,
+         "output_fn": generate_filename(),
+         "format": "wav",
+         "time_start": time_start,
+         "time_end": time_end,
+         "fade_out_duration": 1.0,
+     }
+ 
+     if args["time_start"] is None:
+         args["time_start"] = 0.0
+     args["time_end"] = args["time_start"] + args["max_generate_audio_seconds"]
+ 
+     print(args)
+     return args
+ 
  def trim_audio(audio_file, cut_seconds=5):
+     audio, sr = torchaudio.load(audio_file)
+     num_samples = int(cut_seconds * sr)
+     trimmed_audio = audio[:, :num_samples]
+     output_path = os.path.join(AUDIO_PROMPT_DIR, "audio_prompt_" + generate_filename() + ".wav")
+     torchaudio.save(output_path, trimmed_audio, sr)
+     return output_path
+ 
+ @spaces.GPU(duration=120)
+ def music_generation(args):
+     set_env_variables()
+     model = InspireMusicUnified(
+         model_name=args["model_name"],
+         model_dir=args["model_dir"],
+         min_generate_audio_seconds=args["min_generate_audio_seconds"],
+         max_generate_audio_seconds=args["max_generate_audio_seconds"],
+         sample_rate=24000,
+         output_sample_rate=args["output_sample_rate"],
+         load_jit=True,
+         load_onnx=False,
+         fast=args["fast"],
+         result_dir=args["result_dir"])
+ 
+     output_path = model.inference(
+         task=args["task"],
+         text=args["text"],
+         audio_prompt=args["audio_prompt"],
+         chorus=args["chorus"],
+         time_start=args["time_start"],
+         time_end=args["time_end"],
+         output_fn=args["output_fn"],
+         max_audio_prompt_length=args["max_audio_prompt_length"],
+         fade_out_duration=args["fade_out_duration"],
+         output_format=args["format"],
+         fade_out_mode=args["fade_out"],
+         trim=args["trim"])
+     return output_path
+ 
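Compared with the code this commit removes, music_generation now constructs InspireMusicUnified inside every request, paying the full model-load cost per click. A minimal memoization sketch, keyed on the settings that affect construction (an illustration under that assumption, not part of the commit):

_model_cache = {}

def get_model(args):
    key = (args["model_name"], args["output_sample_rate"],
           args["max_generate_audio_seconds"])
    if key not in _model_cache:
        # First request for this configuration loads the model; later ones reuse it
        _model_cache[key] = InspireMusicUnified(
            model_name=args["model_name"],
            model_dir=args["model_dir"],
            min_generate_audio_seconds=args["min_generate_audio_seconds"],
            max_generate_audio_seconds=args["max_generate_audio_seconds"],
            sample_rate=24000,
            output_sample_rate=args["output_sample_rate"],
            load_jit=True,
            load_onnx=False,
            fast=args["fast"],
            result_dir=args["result_dir"])
    return _model_cache[key]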
+ def demo_inspiremusic_t2m(text, model_name, chorus,
+                           output_sample_rate, max_generate_audio_seconds):
+     args = get_args(
+         task='text-to-music', text=text, audio=None,
+         model_name=model_name, chorus=chorus,
+         output_sample_rate=output_sample_rate,
+         max_generate_audio_seconds=max_generate_audio_seconds)
+     return music_generation(args)
+ 
+ def demo_inspiremusic_con(text, audio, model_name, chorus,
+                           output_sample_rate, max_generate_audio_seconds):
+     args = get_args(
+         task='continuation', text=text, audio=trim_audio(audio, cut_seconds=5),
+         model_name=model_name, chorus=chorus,
+         output_sample_rate=output_sample_rate,
+         max_generate_audio_seconds=max_generate_audio_seconds)
+     return music_generation(args)
 
  def main():
+     with gr.Blocks(theme=gr.themes.Soft()) as demo:
+         gr.Markdown("""
+         # InspireMusic
+         - Supports long-form music generation with high audio quality, at sampling rates up to 48 kHz.
+         - GitHub: https://github.com/FunAudioLLM/InspireMusic/ | ModelScope Studio: https://modelscope.cn/studios/iic/InspireMusic
+         - Available music generation models: [InspireMusic-1.5B-Long](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-Long), [InspireMusic-1.5B](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B), [InspireMusic-Base](https://huggingface.co/FunAudioLLM/InspireMusic-Base), [InspireMusic-1.5B-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-1.5B-24kHz), [InspireMusic-Base-24kHz](https://huggingface.co/FunAudioLLM/InspireMusic-Base-24kHz), all available on Hugging Face and ModelScope.
+         - Currently only English text prompts are supported.
+         - This page is a demo; to generate long-form audio (e.g., 5 minutes), please deploy InspireMusic locally. Thank you for your support.
+         """)
+ 
+         with gr.Row(equal_height=True):
+             model_name = gr.Dropdown(MODELS, label="Select Model Name",
+                                      value="InspireMusic-1.5B-Long")
+             chorus = gr.Dropdown(["intro", "verse", "chorus", "outro"],
+                                  label="Chorus Mode", value="intro")
+             output_sample_rate = gr.Dropdown([48000, 24000],
+                                              label="Output Audio Sample Rate (Hz)",
+                                              value=48000)
+             max_generate_audio_seconds = gr.Slider(10, 300,
+                                                    label="Generate Audio Length (s)",
+                                                    value=30)
+ 
+         with gr.Row(equal_height=True):
+             text_input = gr.Textbox(label="Input Text (For Text-to-Music Task)",
+                                     value="Experience soothing and sensual instrumental jazz with a touch of Bossa Nova, perfect for a relaxing restaurant or spa ambiance.")
+             audio_input = gr.Audio(label="Input Audio Prompt (For Music Continuation Task)",
+                                    type="filepath")
+         music_output = gr.Audio(label="Generated Music", type="filepath",
+                                 autoplay=True, show_download_button=True)
+ 
+         with gr.Row():
+             button = gr.Button("Start Text-to-Music Task")
+             button.click(demo_inspiremusic_t2m,
+                          inputs=[text_input, model_name, chorus,
+                                  output_sample_rate, max_generate_audio_seconds],
+                          outputs=music_output)
+ 
+             generate_button = gr.Button("Start Music Continuation Task")
+             generate_button.click(demo_inspiremusic_con,
+                                   inputs=[text_input, audio_input, model_name, chorus,
+                                           output_sample_rate, max_generate_audio_seconds],
+                                   outputs=music_output)
+ 
+         t2m_examples = gr.Examples(examples=DEMO_TEXT_PROMPTS, inputs=[text_input])
+     demo.launch()
+ 
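Since the slider allows up to 300 seconds of audio, a generation can outlast a default HTTP request timeout. A common remedy, sketched here as a suggestion rather than part of the commit, is to route requests through Gradio's queue:

# Drop-in replacement for the demo.launch() call above
demo.queue().launch()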
+ if __name__ == '__main__':
+     os.makedirs(AUDIO_PROMPT_DIR, exist_ok=True)
+     os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
+     main()