YuE-music-generator-demo-zero

Paused

App Files Files Community

KingNish committed on Feb 1

Commit

0be5f10

verified ·

1 Parent(s): c874206

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -71

app.py CHANGED Viewed

@@ -67,18 +67,19 @@ import time
 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
-#from vocoder import build_codec_model, process_audio # removed vocoder
-#from post_process_audio import replace_low_freq_with_energy_matched # removed post process
 device = "cuda:0"
 model = AutoModelForCausalLM.from_pretrained(
-    "m-a-p/YuE-s1-7B-anneal-en-icl", # "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
-    low_cpu_mem_usage=True,
 ).to(device)
 model.eval()
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
@@ -92,9 +93,61 @@ codec_model = eval(model_config.generator.name)(**model_config.generator.config)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
 codec_model.eval()
 @spaces.GPU(duration=120)
 def generate_music(
         max_new_tokens=5,
         run_n_segments=2,
@@ -107,6 +160,11 @@ def generate_music(
         cuda_idx=0,
         rescale=False,
 ):
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
     cuda_idx = cuda_idx
@@ -116,31 +174,7 @@ def generate_music(
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         os.makedirs(stage1_output_dir, exist_ok=True)
-        class BlockTokenRangeProcessor(LogitsProcessor):
-            def __init__(self, start_id, end_id):
-                self.blocked_token_ids = list(range(start_id, end_id))
-            def __call__(self, input_ids, scores):
-                scores[:, self.blocked_token_ids] = -float("inf")
-                return scores
-        def load_audio_mono(filepath, sampling_rate=16000):
-            audio, sr = torchaudio.load(filepath)
-            # Convert to mono
-            audio = torch.mean(audio, dim=0, keepdim=True)
-            # Resample if needed
-            if sr != sampling_rate:
-                resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
-                audio = resampler(audio)
-            return audio
-        def split_lyrics(lyrics: str):
-            pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
-            segments = re.findall(pattern, lyrics, re.DOTALL)
-            structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
-            return structured_lyrics
-        # Call the function and print the result
         stage1_output_set = []
         genres = genre_txt.strip()
@@ -151,16 +185,15 @@ def generate_music(
         prompt_texts += lyrics
         random_id = uuid.uuid4()
-        output_seq = None
-        # Here is suggested decoding config
         top_p = 0.93
         temperature = 1.0
         repetition_penalty = 1.2
-        # special tokens
         start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
         end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
-        raw_output = None
         # Format text prompt
         run_n_segments = min(run_n_segments + 1, len(lyrics))
@@ -169,7 +202,7 @@ def generate_music(
         for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-            guidance_scale = 1.5 if i <= 1 else 1.2
             if i == 0:
                 continue
             if i == 1:
@@ -196,30 +229,17 @@ def generate_music(
             prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
             input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
             # Use window slicing in case output sequence exceeds the context of model
             max_context = 16384 - max_new_tokens - 1
             if input_ids.shape[-1] > max_context:
                 print(
                     f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
-            with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
-                output_seq = model.generate(
-                    input_ids=input_ids,
-                    max_new_tokens=max_new_tokens,
-                    min_new_tokens=100,
-                    do_sample=True,
-                    top_p=top_p,
-                    temperature=temperature,
-                    repetition_penalty=repetition_penalty,
-                    eos_token_id=mmtokenizer.eoa,
-                    pad_token_id=mmtokenizer.eoa,
-                    logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
-                    guidance_scale=guidance_scale,
-                    use_cache=True
-                )
-                if output_seq[0][-1].item() != mmtokenizer.eoa:
-                    tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
-                    output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
             if i > 1:
                 raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
             else:
@@ -240,7 +260,7 @@ def generate_music(
             codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
             if codec_ids[0] == 32016:
                 codec_ids = codec_ids[1:]
-            codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
             vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
             vocals.append(vocals_ids)
             instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
@@ -282,7 +302,7 @@ def generate_music(
             decoded_waveform = decoded_waveform.cpu().squeeze(0)
             decodec_rlt.append(torch.as_tensor(decoded_waveform))
             decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
             tracks.append(save_path)
             save_audio(decodec_rlt, save_path, 16000)
         # mix tracks
@@ -306,7 +326,11 @@ def generate_music(
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
-    # Execute the command
     try:
         mixed_audio_data, vocal_audio_data, instrumental_audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
                                cuda_idx=0, max_new_tokens=max_new_tokens)
@@ -315,10 +339,10 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=
         gr.Warning("An Error Occured: " + str(e))
         return None, None, None
     finally:
-        print("Temporary files deleted.")
-# Gradio
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
@@ -352,19 +376,6 @@ with gr.Blocks() as demo:
         gr.Examples(
             examples=[
-#                 ["Rap-Rock Hybrid Punk basslines Scream-rap fusion Crowd chant vocals Distorted turntable scratches Rebel male vocal",
-#                     """[verse]
-# I'm the glitch in the algorithm's perfect face
-# Spit code red in 8-bit, corrupt the marketplace
-# Leather jacket pixels in a digital storm
-# Got meme knives that go viral, keep the normies warm
-# [chorus]
-# BREAK-CORE! (Break-core!)
-# Code-slicin' through the mainframe's bore
-# FAKE WAR! (Fake war!)
-# Trend-detonate, I'm the feedback roar
-#                     """],
                 [
                     "rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
                     """[verse]
@@ -415,5 +426,5 @@ Locked inside my mind, hot flame.
         outputs=[music_out, vocal_out, instrumental_out]
     )
     gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
 demo.queue().launch(show_error=True)

 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
 device = "cuda:0"
+# Load model and tokenizer outside the generation function (load once)
+print("Loading model...")
 model = AutoModelForCausalLM.from_pretrained(
+    "m-a-p/YuE-s1-7B-anneal-en-cot", # "m-a-p/YuE-s1-7B-anneal-en-icl",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
 ).to(device)
 model.eval()
+print("Model loaded.")
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
 codec_model.eval()
+print("Codec model loaded.")
class BlockTokenRangeProcessor(LogitsProcessor):
    """Logits processor that forbids every token id in ``[start_id, end_id)``.

    The blocked ids form a contiguous half-open range, so the scores are
    masked with a slice assignment rather than by indexing with a Python
    list (which would be turned into an index tensor on every decoding step).

    Args:
        start_id: First token id to block (inclusive).
        end_id: End of the blocked range (exclusive). When
            ``end_id <= start_id`` the range is empty and nothing is blocked.

    NOTE(review): token ids are assumed to be non-negative vocabulary
    indices -- confirm if negative ids are ever passed.
    """

    def __init__(self, start_id, end_id):
        self.start_id = start_id
        self.end_id = end_id
        # Kept so existing code that inspects the explicit id list still works.
        self.blocked_token_ids = list(range(start_id, end_id))

    def __call__(self, input_ids, scores):
        """Set the scores of all blocked token ids to ``-inf`` in place.

        Args:
            input_ids: Token ids generated so far (unused).
            scores: Score tensor indexed as ``[row, token_id]``.

        Returns:
            The same ``scores`` tensor, modified in place.
        """
        # An empty range blocks nothing; skip the assignment entirely.
        if self.blocked_token_ids:
            scores[:, self.start_id:self.end_id] = -float("inf")
        return scores
def load_audio_mono(filepath, sampling_rate=16000):
    """Load an audio file, downmix it to mono and resample it if required.

    Args:
        filepath: Path of the audio file handed to ``torchaudio.load``.
        sampling_rate: Desired sample rate of the returned audio.

    Returns:
        The audio averaged over dimension 0 (that dimension is kept with
        size 1), resampled to ``sampling_rate`` when the file's own rate
        differs.
    """
    waveform, source_rate = torchaudio.load(filepath)
    # Downmix to mono: average over dim 0 and keep it as a singleton.
    mono = torch.mean(waveform, dim=0, keepdim=True)
    # Already at the requested rate: nothing more to do.
    if source_rate == sampling_rate:
        return mono
    to_target_rate = Resample(orig_freq=source_rate, new_freq=sampling_rate)
    return to_target_rate(mono)
def split_lyrics(lyrics: str):
    """Split tagged lyrics into one formatted string per section.

    A section starts with a ``[tag]`` marker (``tag`` made of word
    characters) and runs until the next line that begins with ``[`` or
    until the end of the text. The final section is captured whether or not
    the lyrics end with a newline.

    Args:
        lyrics: Full lyrics text, e.g. ``"[verse]\\n...\\n[chorus]\\n..."``.

    Returns:
        A list of strings of the form ``"[tag]\\n<stripped body>\\n\\n"``,
        in order of appearance. Empty when no ``[tag]`` marker is found.
    """
    # The lookahead stops a section before "\n[" (start of the next tag line)
    # or at end-of-string; it does not demand a trailing newline, so a last
    # section without one is no longer dropped.
    pattern = r"\[(\w+)\](.*?)(?=\n\[|\Z)"
    segments = re.findall(pattern, lyrics, re.DOTALL)
    return [f"[{tag}]\n{body.strip()}\n\n" for tag, body in segments]
 @spaces.GPU(duration=120)
+def model_inference(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale):
+    """
+    Performs model inference to generate music tokens.
+    This function is decorated with @spaces.GPU for GPU usage in Gradio Spaces.
+    """
+    with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
+        output_seq = model.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=100, # Keep min_new_tokens to avoid short generations
+            do_sample=True,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            eos_token_id=mmtokenizer.eoa,
+            pad_token_id=mmtokenizer.eoa,
+            logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
+            guidance_scale=guidance_scale,
+            use_cache=True
+        )
+        if output_seq[0][-1].item() != mmtokenizer.eoa:
+            tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
+            output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
+    return output_seq
 def generate_music(
         max_new_tokens=5,
         run_n_segments=2,
         cuda_idx=0,
         rescale=False,
 ):
+    """
+    Generates music based on given genre and lyrics, optionally using an audio prompt.
+    This function orchestrates the music generation process, including prompt formatting,
+    model inference, and audio post-processing.
+    """
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
     cuda_idx = cuda_idx
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         os.makedirs(stage1_output_dir, exist_ok=True)
         stage1_output_set = []
         genres = genre_txt.strip()
         prompt_texts += lyrics
         random_id = uuid.uuid4()
+        raw_output = None
+        # Decoding config (moved here for better readability)
         top_p = 0.93
         temperature = 1.0
         repetition_penalty = 1.2
         start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
         end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
         # Format text prompt
         run_n_segments = min(run_n_segments + 1, len(lyrics))
         for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
+            guidance_scale = 1.5 if i <= 1 else 1.2 # Guidance scale adjusted based on segment index
             if i == 0:
                 continue
             if i == 1:
             prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
             input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
             # Use window slicing in case output sequence exceeds the context of model
             max_context = 16384 - max_new_tokens - 1
             if input_ids.shape[-1] > max_context:
                 print(
                     f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
+            output_seq = model_inference(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale)
             if i > 1:
                 raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
             else:
             codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
             if codec_ids[0] == 32016:
                 codec_ids = codec_ids[1:]
+            codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)] # Ensure even length for reshape
             vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
             vocals.append(vocals_ids)
             instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
             decoded_waveform = decoded_waveform.cpu().squeeze(0)
             decodec_rlt.append(torch.as_tensor(decoded_waveform))
             decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3") # Save as mp3 for gradio
             tracks.append(save_path)
             save_audio(decodec_rlt, save_path, 16000)
         # mix tracks
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
+    """
+    Gradio interface function to generate music.
+    This function takes genre, lyrics, and generation parameters from Gradio inputs,
+    calls the music generation pipeline, and returns the audio outputs.
+    """
     try:
         mixed_audio_data, vocal_audio_data, instrumental_audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
                                cuda_idx=0, max_new_tokens=max_new_tokens)
         gr.Warning("An Error Occured: " + str(e))
         return None, None, None
     finally:
+        print("Temporary files deleted.") # This message is printed regardless of success/failure
+# Gradio Interface
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
         gr.Examples(
             examples=[
                 [
                     "rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
                     """[verse]
         outputs=[music_out, vocal_out, instrumental_out]
     )
     gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
 demo.queue().launch(show_error=True)