Update app.py
app.py CHANGED
@@ -46,7 +46,6 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 
-
 # don't change above code
 
 import argparse
@@ -68,13 +67,12 @@ import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
 
-
 device = "cuda:0"
 
 # Load model and tokenizer outside the generation function (load once)
 print("Loading model...")
 model = AutoModelForCausalLM.from_pretrained(
-    "m-a-p/YuE-s1-7B-anneal-en-cot",
+    "m-a-p/YuE-s1-7B-anneal-en-cot",  # "m-a-p/YuE-s1-7B-anneal-en-icl",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
 ).to(device)
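Note on this hunk: attn_implementation="flash_attention_2" makes the load fail outright when the flash-attn wheel is missing. A minimal sketch of a more forgiving loader, assuming transformers 4.36+ and that falling back to PyTorch SDPA is acceptable (illustrative only, not part of the commit):

import torch
from transformers import AutoModelForCausalLM

def load_stage1_model(repo_id="m-a-p/YuE-s1-7B-anneal-en-cot", device="cuda:0"):
    try:
        # Preferred path: FlashAttention 2 kernels (requires the flash-attn package).
        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype=torch.float16,
            attn_implementation="flash_attention_2",
        )
    except (ImportError, ValueError):
        # Fallback: PyTorch's built-in scaled-dot-product attention.
        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype=torch.float16,
            attn_implementation="sdpa",
        )
    return model.to(device).eval()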
@@ -139,7 +137,7 @@ def generate_music(
     model inference, and audio post-processing.
     """
     if use_audio_prompt and not audio_prompt_path:
-        raise FileNotFoundError("Please
+        raise FileNotFoundError("Please provide an audio prompt file when 'Use Audio Prompt' is enabled!")
     cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
 
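The check above raises a plain FileNotFoundError, which Gradio only surfaces in the UI because the app launches with show_error=True. An alternative sketch using gr.Error, which Gradio renders directly as a user-facing message (illustrative only; the helper name is hypothetical):

import os
import gradio as gr

def check_audio_prompt(use_audio_prompt, audio_prompt_path):
    # Raise a UI-visible error instead of a generic server-side exception.
    if use_audio_prompt:
        if not audio_prompt_path:
            raise gr.Error("Please provide an audio prompt file when 'Use Audio Prompt' is enabled!")
        if not os.path.exists(audio_prompt_path):
            raise gr.Error(f"Audio prompt not found: {audio_prompt_path}")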
@@ -147,12 +145,11 @@ def generate_music(
     stage1_output_dir = os.path.join(output_dir, f"stage1")
     os.makedirs(stage1_output_dir, exist_ok=True)
 
-
     stage1_output_set = []
 
     genres = genre_txt.strip()
     lyrics = split_lyrics(lyrics_txt + "\n")
-    #
+    # instruction
     full_lyrics = "\n".join(lyrics)
     prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
     prompt_texts += lyrics
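For reference, the prompt layout built here is one global instruction followed by the per-segment lyric blocks. A toy, self-contained illustration (split_lyrics is defined elsewhere in app.py, so a pre-split list stands in for its output; the lyric text is placeholder):

genres = "inspiring female uplifting pop airy vocal"
lyrics = ["[verse]\nStaring at the sunset...\n", "[chorus]\nDon't let this moment fade...\n"]
full_lyrics = "\n".join(lyrics)
prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
prompt_texts += lyrics
# prompt_texts[0] is the global instruction; prompt_texts[1:] are the per-segment prompts.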
@@ -160,14 +157,13 @@ def generate_music(
     random_id = uuid.uuid4()
     raw_output = None
 
-    # Decoding config
+    # Decoding config
     top_p = 0.93
     temperature = 1.0
     repetition_penalty = 1.2
     start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
     end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
 
-
     # Format text prompt
     run_n_segments = min(run_n_segments + 1, len(lyrics))
 
@@ -175,7 +171,7 @@ def generate_music(
 
     for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
         section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-        guidance_scale = 1.5 if i <= 1 else 1.2
+        guidance_scale = 1.5 if i <= 1 else 1.2  # Guidance scale adjusted based on segment index
         if i == 0:
             continue
         if i == 1:
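Segment index 0 is the instruction-only prompt and is skipped, so the first sung segment (index 1) gets the stronger guidance of 1.5 and later segments drop to 1.2. The same schedule, factored out as a tiny helper purely for clarity (not part of the Space):

def segment_guidance_scale(segment_index):
    """First sung segment gets stronger guidance than later ones."""
    return 1.5 if segment_index <= 1 else 1.2

assert segment_guidance_scale(1) == 1.5
assert segment_guidance_scale(3) == 1.2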
@@ -213,13 +209,12 @@ def generate_music(
     def model_inference(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale):
         """
         Performs model inference to generate music tokens.
-        This function is decorated with @spaces.GPU for GPU usage in Gradio Spaces.
         """
         with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
             output_seq = model.generate(
                 input_ids=input_ids,
                 max_new_tokens=max_new_tokens,
-                min_new_tokens=100,
+                min_new_tokens=100,  # Keep min_new_tokens to avoid short generations
                 do_sample=True,
                 top_p=top_p,
                 temperature=temperature,
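The sampling knobs spread across these hunks (top_p 0.93, temperature 1.0, repetition_penalty 1.2, min_new_tokens 100, per-segment guidance_scale) can also be collected into a transformers GenerationConfig; guidance_scale above 1 is the classifier-free guidance setting in recent transformers releases. A hedged sketch, where using mmtokenizer.eoa as the eos token id is an assumption rather than something shown in this diff:

from transformers import GenerationConfig

gen_config = GenerationConfig(
    do_sample=True,
    top_p=0.93,
    temperature=1.0,
    repetition_penalty=1.2,
    min_new_tokens=100,   # avoid degenerate, very short segments
    guidance_scale=1.5,   # >1 enables classifier-free guidance in recent transformers
)
# output_seq = model.generate(input_ids=input_ids, max_new_tokens=max_new_tokens,
#                             generation_config=gen_config, eos_token_id=mmtokenizer.eoa)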
@@ -234,7 +229,7 @@ def generate_music(
         tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
         output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
         return output_seq
-
+
     output_seq = model_inference(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale)
 
     if i > 1:
@@ -257,7 +252,7 @@ def generate_music(
         codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
         if codec_ids[0] == 32016:
             codec_ids = codec_ids[1:]
-        codec_ids = codec_ids[:2 * (codec_ids
+        codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]  # Ensure even length for reshape
         vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
         vocals.append(vocals_ids)
         instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
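The fixed line truncates the codec stream to an even length because stage 1 emits vocal and instrumental codebook ids interleaved token by token; the following rearrange then splits them into the two stems. A toy demonstration with placeholder ids (assumes numpy and einops):

import numpy as np
from einops import rearrange

codec_ids = np.array([10, 20, 11, 21, 12, 22, 13])  # interleaved vocal/instrumental, odd length
codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]   # drop the dangling token -> even length
stems = rearrange(codec_ids, "(n b) -> b n", b=2)   # row 0: vocal ids, row 1: instrumental ids
vocals_ids, instrumentals_ids = stems[0], stems[1]
print(vocals_ids)          # [10 11 12]
print(instrumentals_ids)   # [20 21 22]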
@@ -294,19 +289,17 @@ def generate_music(
         decodec_rlt = []
         with torch.no_grad():
             decoded_waveform = codec_model.decode(
-                torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
-                    device))
+                torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
         decoded_waveform = decoded_waveform.cpu().squeeze(0)
         decodec_rlt.append(torch.as_tensor(decoded_waveform))
         decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-        save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
+        save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")  # Save as mp3 for gradio
         tracks.append(save_path)
         save_audio(decodec_rlt, save_path, 16000)
     # mix tracks
     for inst_path in tracks:
         try:
-            if (inst_path.endswith('.wav') or inst_path.endswith('.mp3'))
-                and 'instrumental' in inst_path:
+            if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) and 'instrumental' in inst_path:
                 # find pair
                 vocal_path = inst_path.replace('instrumental', 'vocal')
                 if not os.path.exists(vocal_path):
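The corrected condition only selects the instrumental stems; the actual mixing happens below this hunk and is not shown here. Purely as an illustration of what pairing and mixing a vocal/instrumental pair typically looks like, a sketch assuming 16 kHz stems readable by soundfile (function and paths are hypothetical, not the Space's code):

import numpy as np
import soundfile as sf

def mix_pair(vocal_path, inst_path, out_path):
    vocal, sr_v = sf.read(vocal_path)
    inst, sr_i = sf.read(inst_path)
    assert sr_v == sr_i, "both stems are expected at the same sample rate (16 kHz here)"
    n = min(len(vocal), len(inst))      # guard against small length mismatches
    mix = (vocal[:n] + inst[:n]) / 2.0  # simple average to avoid clipping
    sf.write(out_path, mix, sr_v)
    return out_path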
@@ -321,7 +314,6 @@ def generate_music(
         print(e)
         return None, None, None
 
-
 # Gradio Interface
 with gr.Blocks() as demo:
     with gr.Column():
@@ -343,17 +335,33 @@ with gr.Blocks() as demo:
         with gr.Column():
             genre_txt = gr.Textbox(label="Genre")
             lyrics_txt = gr.Textbox(label="Lyrics")
-
+            use_audio_prompt = gr.Checkbox(label="Use Audio Prompt?", value=False)
+            audio_prompt_input = gr.Audio(source="upload", type="filepath", label="Audio Prompt (Optional)")
         with gr.Column():
             num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
             max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
             submit_btn = gr.Button("Submit")
-
         music_out = gr.Audio(label="Mixed Audio Result")
         with gr.Accordion(label="Vocal and Instrumental Result", open=False):
             vocal_out = gr.Audio(label="Vocal Audio")
             instrumental_out = gr.Audio(label="Instrumental Audio")
+    gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
+
+    # When the "Submit" button is clicked, pass the additional audio-related inputs to the function.
+    submit_btn.click(
+        fn=generate_music,
+        inputs=[
+            genre_txt,
+            lyrics_txt,
+            num_segments,
+            max_new_tokens,
+            use_audio_prompt,
+            audio_prompt_input,
+        ],
+        outputs=[music_out, vocal_out, instrumental_out]
+    )
 
+    # Examples updated to only include text inputs
     gr.Examples(
         examples=[
             [
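This hunk wires the new audio-prompt controls into a proper submit_btn.click call. One caveat worth hedging: gr.Audio(source="upload", ...) is the Gradio 3.x signature; on Gradio 4.x the parameter is sources=[...] and passing source= raises a TypeError. A self-contained sketch of the same wiring with a stub callback, runnable on either major version because it omits source entirely (the stub is mine, not the Space's generate_music):

import gradio as gr

def generate_music_stub(genre, lyrics, num_segments, max_new_tokens,
                        use_audio_prompt, audio_prompt_path):
    # The real Space runs stage-1 generation, codec decoding and mixing here.
    return None, None, None

with gr.Blocks() as demo:
    genre_txt = gr.Textbox(label="Genre")
    lyrics_txt = gr.Textbox(label="Lyrics")
    use_audio_prompt = gr.Checkbox(label="Use Audio Prompt?", value=False)
    audio_prompt_input = gr.Audio(type="filepath", label="Audio Prompt (Optional)")
    num_segments = gr.Number(label="Number of Segments", value=2)
    max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15)
    submit_btn = gr.Button("Submit")
    music_out = gr.Audio(label="Mixed Audio Result")
    vocal_out = gr.Audio(label="Vocal Audio")
    instrumental_out = gr.Audio(label="Instrumental Audio")
    submit_btn.click(
        fn=generate_music_stub,
        inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens,
                use_audio_prompt, audio_prompt_input],
        outputs=[music_out, vocal_out, instrumental_out],
    )

if __name__ == "__main__":
    demo.queue().launch()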
@@ -400,11 +408,4 @@ Locked inside my mind, hot flame.
         fn=generate_music
     )
 
-
-    fn=generate_music,
-    inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
-    outputs=[music_out, vocal_out, instrumental_out]
-    )
-    gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
-
-    demo.queue().launch(show_error=True)
+demo.queue().launch(show_error=True)