Update app.py
app.py CHANGED
@@ -8,6 +8,7 @@ import torch
 import sys
 import uuid
 import re
+import threading
 
 print("Installing flash-attn...")
 # Install flash attention
@@ -133,23 +134,19 @@ def generate_music(
 ):
     """
     Generates music based on given genre and lyrics, optionally using an audio prompt.
-
-    model inference, and audio post-processing.
+    Runs segment generation in parallel using threading.
     """
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please provide an audio prompt file when 'Use Audio Prompt' is enabled!")
-
+
     max_new_tokens = max_new_tokens * 100
-
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         os.makedirs(stage1_output_dir, exist_ok=True)
-
         stage1_output_set = []
 
         genres = genre_txt.strip()
         lyrics = split_lyrics(lyrics_txt + "\n")
-        # instruction
         full_lyrics = "\n".join(lyrics)
         prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
         prompt_texts += lyrics
@@ -157,23 +154,21 @@ def generate_music(
         random_id = uuid.uuid4()
         raw_output = None
 
-        # Decoding config
-        top_p = 0.93
-        temperature = 1.0
-        repetition_penalty = 1.2
-        start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
-        end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
-
-        # Format text prompt
         run_n_segments = min(run_n_segments + 1, len(lyrics))
-
         print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
 
-        for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
+        threads = []
+        segment_outputs = [None] * run_n_segments  # Store outputs in correct order
+
+        def process_segment(i, p):
+            nonlocal raw_output
             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-            guidance_scale = 1.5 if i <= 1 else 1.2
+            guidance_scale = 1.5 if i <= 1 else 1.2
+
             if i == 0:
-                continue
+                return
+
+            prompt_ids = None
             if i == 1:
                 if use_audio_prompt:
                     audio_prompt = load_audio_mono(audio_prompt_path)
@@ -182,16 +177,13 @@ def generate_music(
                     raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
                     raw_codes = raw_codes.transpose(0, 1)
                     raw_codes = raw_codes.cpu().numpy().astype(np.int16)
-
-
-
-                    audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
-                        mmtokenizer.eoa]
-                    sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
-                        "[end_of_reference]")
+                    audio_prompt_codec = codectool.npy2ids(raw_codes[0])
+                    audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
+                    sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
                     head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
                 else:
                     head_id = mmtokenizer.tokenize(prompt_texts[0])
+
                 prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
             else:
                 prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
@@ -199,22 +191,19 @@ def generate_music(
             prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
             input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
 
-            # Use window slicing in case output sequence exceeds the context of model
             max_context = 16384 - max_new_tokens - 1
             if input_ids.shape[-1] > max_context:
-                print(
-                    f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
 
             with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
                 output_seq = model.generate(
                     input_ids=input_ids,
                     max_new_tokens=max_new_tokens,
-                    min_new_tokens=100,
+                    min_new_tokens=100,
                     do_sample=True,
-                    top_p=top_p,
-                    temperature=temperature,
-                    repetition_penalty=repetition_penalty,
+                    top_p=0.93,
+                    temperature=1.0,
+                    repetition_penalty=1.2,
                     eos_token_id=mmtokenizer.eoa,
                     pad_token_id=mmtokenizer.eoa,
                     logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
@@ -222,17 +211,27 @@ def generate_music(
                     use_cache=True,
                     num_beams=3
                 )
+
             if output_seq[0][-1].item() != mmtokenizer.eoa:
                 tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
                 output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
 
-            if i > 1:
-                raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
-            else:
-                raw_output = output_seq
-            print(len(raw_output))
+            segment_outputs[i] = output_seq  # Store in order
+
+        # Start threads
+        for i, p in enumerate(prompt_texts[:run_n_segments]):
+            thread = threading.Thread(target=process_segment, args=(i, p))
+            threads.append(thread)
+            thread.start()
+
+        # Wait for all threads to finish
+        for thread in threads:
+            thread.join()
+
+        # Combine results in order
+        raw_output = torch.cat([seg for seg in segment_outputs if seg is not None], dim=1)
 
-        #
+        # Save and process audio (same as before)
         ids = raw_output[0].cpu().numpy()
         soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
         eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
@@ -333,7 +332,7 @@ with gr.Blocks() as demo:
         audio_prompt_input = gr.Audio(type="filepath", label="Audio Prompt (Optional)")
     with gr.Column():
         num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
-        max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
+        max_new_tokens = gr.Slider(label="Duration of song", info="On ZeroGPU, the max it supports is 20 seconds", minimum=1, maximum=30, step=1, value=15, interactive=True)
         submit_btn = gr.Button("Submit")
         music_out = gr.Audio(label="Mixed Audio Result")
         with gr.Accordion(label="Vocal and Instrumental Result", open=False):
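A minimal standalone sketch of the thread fan-out/join pattern this commit introduces: one thread per segment, with outputs recombined in index order. It is not part of the commit; run_segments and the constant-tensor stand-in for model.generate() are illustrative only.

# Sketch of the commit's threading pattern (assumed simplification:
# the real model.generate() call is replaced by a constant tensor).
import threading

import torch


def run_segments(prompt_texts):
    threads = []
    segment_outputs = [None] * len(prompt_texts)  # one slot per segment keeps order

    def process_segment(i, p):
        if i == 0:
            return  # segment 0 is the instruction header; nothing is generated
        segment_outputs[i] = torch.full((1, 4), i)  # stand-in for generated token ids

    for i, p in enumerate(prompt_texts):
        thread = threading.Thread(target=process_segment, args=(i, p))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

    # Skip the empty slot for segment 0 and concatenate along the sequence dim.
    return torch.cat([seg for seg in segment_outputs if seg is not None], dim=1)


print(run_segments(["header", "verse", "chorus"]).shape)  # torch.Size([1, 8])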