Update app.py
app.py
CHANGED
@@ -15,6 +15,8 @@ subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
+    capture_output=True, # Capture output for debugging
+    text=True # Decode output as text
 )

 from huggingface_hub import snapshot_download
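For context on the two new `subprocess.run` arguments: `capture_output=True` collects the installer's stdout/stderr on the returned `CompletedProcess` instead of streaming them to the console, and `text=True` decodes them as `str`. A minimal sketch of how that captured output could be surfaced; the `result` name and the return-code check are illustrative, the commit itself does not inspect the result:

import subprocess

result = subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
    capture_output=True,  # stdout/stderr end up on result.stdout / result.stderr
    text=True,            # decode captured bytes to str
)
if result.returncode != 0:
    # Hypothetical debugging aid, not part of app.py.
    print("flash-attn install failed:", result.stderr)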
@@ -75,16 +77,9 @@ device = "cuda:0"
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2",
+    attn_implementation="flash_attention_2",
+    load_in_4bit=True # Or load_in_8bit=True
 ).to(device)
-# assistant_model = AutoModelForCausalLM.from_pretrained(
-#     "m-a-p/YuE-s2-1B-general",
-#     torch_dtype=torch.float16,
-#     attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
-# ).to(device)
-# assistant_model = torch.compile(assistant_model)
-# model = torch.compile(model)
-# assistant_model.eval()
 model.eval()

 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
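A caveat worth noting about the new `load_in_4bit=True` flag: bitsandbytes-based quantization needs the `bitsandbytes` and `accelerate` packages installed, and recent `transformers` releases reject `.to(device)` on an already-quantized model and prefer a `quantization_config` object over the bare keyword. A sketch of the equivalent loading path under those assumptions; this is not what the commit does, which keeps the plain keyword plus `.to(device)`:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # compute in fp16 even though weights are stored in 4-bit
)
model = AutoModelForCausalLM.from_pretrained(
    "m-a-p/YuE-s1-7B-anneal-en-cot",
    attn_implementation="flash_attention_2",
    quantization_config=quant_config,  # requires bitsandbytes + accelerate
    device_map="cuda:0",               # replaces the explicit .to(device) call
)
model.eval()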
@@ -130,7 +125,7 @@ def generate_music(
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
     cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
-
+
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         os.makedirs(stage1_output_dir, exist_ok=True)
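The `max_new_tokens * 100` line ties into the UI change further down: the Gradio slider labeled "Duration of song" (new default 15, maximum 30) supplies a small number that this line scales by 100 into the token budget handed to generation. A tiny illustration of that mapping, using only the slider's defaults from this commit:

slider_value = 15                    # default of the "Duration of song" slider
max_new_tokens = slider_value * 100  # -> 1500 new tokens requested per generation call
print(max_new_tokens)                # 1500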
@@ -234,7 +229,7 @@ def generate_music(
                 pad_token_id=mmtokenizer.eoa,
                 logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
                 guidance_scale=guidance_scale,
-                use_cache=True,
+                use_cache=True, # KV Caching is enabled here!
                 top_k=50,
                 num_beams=1
             )
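On the "KV Caching is enabled here!" comment: with `use_cache=True`, `generate()` stores the key/value tensors of already-processed tokens so each new token only runs attention against the cache instead of re-encoding the whole prefix. A self-contained toy comparison using a small stand-in model; gpt2 here is purely illustrative, the app itself uses the YuE checkpoints above:

import time
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
ids = tok("KV caching demo:", return_tensors="pt").input_ids

for cache in (True, False):
    start = time.time()
    lm.generate(ids, max_new_tokens=64, do_sample=False, use_cache=cache)
    print(f"use_cache={cache}: {time.time() - start:.2f}s")  # the cached run should be noticeably faster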
@@ -268,14 +263,14 @@ def generate_music(
                 instrumentals.append(instrumentals_ids)
             vocals = np.concatenate(vocals, axis=1)
             instrumentals = np.concatenate(instrumentals, axis=1)
-
+
             vocal_save_path = os.path.join(stage1_output_dir, f"vocal_{random_id}".replace('.', '@') + '.npy')
             inst_save_path = os.path.join(stage1_output_dir, f"instrumental_{random_id}".replace('.', '@') + '.npy')
             np.save(vocal_save_path, vocals)
             np.save(inst_save_path, instrumentals)
             stage1_output_set.append(vocal_save_path)
             stage1_output_set.append(inst_save_path)
-
+

             print("Converting to Audio...")

@@ -374,7 +369,7 @@ def generate_music(
             cutoff_freq=5500.0
         )
         print("All process Done")
-
+
         # Load the final audio file and return the numpy array
         final_audio, sr = torchaudio.load(final_output_path)
         return (sr, final_audio.squeeze().numpy())
@@ -402,7 +397,7 @@ with gr.Blocks() as demo:
            <div style="display:flex;column-gap:4px;">
                <a href="https://github.com/multimodal-art-projection/YuE">
                    <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-                </a>
+                </a>
                <a href="https://map-yue.github.io">
                    <img src='https://img.shields.io/badge/Project-Page-green'>
                </a>
@@ -418,32 +413,11 @@ with gr.Blocks() as demo:

         with gr.Column():
             num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
-            max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=
+            max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15,
                                        interactive=True)
             submit_btn = gr.Button("Submit")
             music_out = gr.Audio(label="Audio Result")

-            # gr.Examples(
-            #     examples=[
-            #         ["Rap, Hip-Hop, Street Vibes, Tough, Piercing Vocals, Piano, Synthesizer, Clear Male Vocals",
-            #          """[verse]
-            # Woke up in the morning, sun is shining bright
-            # Chasing all my dreams, gotta get my mind right
-            # City lights are fading, but my vision's clear
-            # Got my team beside me, no room for fear
-            # Walking through the streets, beats inside my head
-            # Every step I take, closer to the bread
-            # People passing by, they don't understand
-            # Building up my future with my own two hands
-            # """],
-            #     ],
-            #     inputs=[genre_txt, lyrics_txt],
-            #     outputs=[music_out],
-            #     cache_examples=True,
-            #     cache_mode="eager",
-            #     fn=infer
-            # )
-
         gr.Examples(
             examples=[
                 [