Update app.py
app.py
CHANGED
@@ -15,6 +15,8 @@ subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
+    capture_output=True, # Capture output for debugging
+    text=True # Decode output as text
 )

 from huggingface_hub import snapshot_download
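For context on the two new `subprocess.run` arguments: `capture_output=True` collects the installer's stdout/stderr on the returned `CompletedProcess` instead of streaming them to the console, and `text=True` decodes them as `str`. A minimal sketch of how that captured output could be surfaced; the `result` name and the return-code check are illustrative, the commit itself does not inspect the result:

import subprocess

result = subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    capture_output=True,  # stdout/stderr end up on result.stdout / result.stderr
    text=True,            # decode captured bytes to str
)
if result.returncode != 0:
    # Hypothetical debugging aid, not part of app.py.
    print("flash-attn install failed:", result.stderr)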
@@ -75,16 +77,9 @@ device = "cuda:0"
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2",
+    attn_implementation="flash_attention_2",
+    load_in_4bit=True # Or load_in_8bit=True
 ).to(device)
-# assistant_model = AutoModelForCausalLM.from_pretrained(
-#     "m-a-p/YuE-s2-1B-general",
-#     torch_dtype=torch.float16,
-#     attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
-# ).to(device)
-# assistant_model = torch.compile(assistant_model)
-# model = torch.compile(model)
-# assistant_model.eval()
 model.eval()

 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
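A caveat worth noting about the new `load_in_4bit=True` flag: bitsandbytes-based quantization needs the `bitsandbytes` and `accelerate` packages installed, and recent `transformers` releases reject `.to(device)` on an already-quantized model and prefer a `quantization_config` object over the bare keyword. A sketch of the equivalent loading path under those assumptions; this is not what the commit does, which keeps the plain keyword plus `.to(device)`:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16 even though weights are stored in 4-bit
)
model = AutoModelForCausalLM.from_pretrained(
    "m-a-p/YuE-s1-7B-anneal-en-cot",
    attn_implementation="flash_attention_2",
    quantization_config=quant_config,  # requires bitsandbytes + accelerate
    device_map="cuda:0",               # replaces the explicit .to(device) call
)
model.eval()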
@@ -130,7 +125,7 @@ def generate_music(
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
     cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
-
+
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         os.makedirs(stage1_output_dir, exist_ok=True)
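The `max_new_tokens * 100` line ties into the UI change further down: the Gradio slider labeled "Duration of song" (new default 15, maximum 30) supplies a small number that this line scales by 100 into the token budget handed to generation. A tiny illustration of that mapping, using only the slider's defaults from this commit:

slider_value = 15                    # default of the "Duration of song" slider
max_new_tokens = slider_value * 100  # -> 1500 new tokens requested per generation call
print(max_new_tokens)                # 1500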
@@ -234,7 +229,7 @@ def generate_music(
                 pad_token_id=mmtokenizer.eoa,
                 logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
                 guidance_scale=guidance_scale,
-                use_cache=True,
+                use_cache=True, # KV Caching is enabled here!
                 top_k=50,
                 num_beams=1
             )
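On the "KV Caching is enabled here!" comment: with `use_cache=True`, `generate()` stores the key/value tensors of already-processed tokens so each new token only runs attention against the cache instead of re-encoding the whole prefix. A self-contained toy comparison using a small stand-in model; gpt2 here is purely illustrative, the app itself uses the YuE checkpoints above:

import time
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
ids = tok("KV caching demo:", return_tensors="pt").input_ids

for cache in (True, False):
    start = time.time()
    lm.generate(ids, max_new_tokens=64, do_sample=False, use_cache=cache)
    print(f"use_cache={cache}: {time.time() - start:.2f}s")  # the cached run should be noticeably faster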
@@ -268,14 +263,14 @@ def generate_music(
                 instrumentals.append(instrumentals_ids)
             vocals = np.concatenate(vocals, axis=1)
             instrumentals = np.concatenate(instrumentals, axis=1)
-
+
             vocal_save_path = os.path.join(stage1_output_dir, f"vocal_{random_id}".replace('.', '@') + '.npy')
             inst_save_path = os.path.join(stage1_output_dir, f"instrumental_{random_id}".replace('.', '@') + '.npy')
             np.save(vocal_save_path, vocals)
             np.save(inst_save_path, instrumentals)
             stage1_output_set.append(vocal_save_path)
             stage1_output_set.append(inst_save_path)
-
+

             print("Converting to Audio...")

@@ -374,7 +369,7 @@ def generate_music(
             cutoff_freq=5500.0
         )
         print("All process Done")
-
+
         # Load the final audio file and return the numpy array
         final_audio, sr = torchaudio.load(final_output_path)
         return (sr, final_audio.squeeze().numpy())
@@ -402,7 +397,7 @@ with gr.Blocks() as demo:
            <div style="display:flex;column-gap:4px;">
                <a href="https://github.com/multimodal-art-projection/YuE">
                    <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-                </a>
+                </a>
                <a href="https://map-yue.github.io">
                    <img src='https://img.shields.io/badge/Project-Page-green'>
                </a>
@@ -418,32 +413,11 @@ with gr.Blocks() as demo:

         with gr.Column():
             num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
-            max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=
+            max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15,
                                        interactive=True)
             submit_btn = gr.Button("Submit")
             music_out = gr.Audio(label="Audio Result")

-            # gr.Examples(
-            #     examples=[
-            #         ["Rap, Hip-Hop, Street Vibes, Tough, Piercing Vocals, Piano, Synthesizer, Clear Male Vocals",
-            #          """[verse]
-            # Woke up in the morning, sun is shining bright
-            # Chasing all my dreams, gotta get my mind right
-            # City lights are fading, but my vision's clear
-            # Got my team beside me, no room for fear
-            # Walking through the streets, beats inside my head
-            # Every step I take, closer to the bread
-            # People passing by, they don't understand
-            # Building up my future with my own two hands
-            # """],
-            #     ],
-            #     inputs=[genre_txt, lyrics_txt],
-            #     outputs=[music_out],
-            #     cache_examples=True,
-            #     cache_mode="eager",
-            #     fn=infer
-            # )
-
         gr.Examples(
             examples=[
                 [