YuE-music-generator-demo-zero

Paused

App Files Files Community

KingNish committed on Feb 2

Commit

b0cba66

verified ·

1 Parent(s): 12f1bb2

Update app.py

Browse files

Files changed (1) hide show

app.py +179 -170

app.py CHANGED Viewed

@@ -1,9 +1,8 @@
 import gradio as gr
 import subprocess
 import os
-import shutil
-import tempfile
 import spaces
 import torch
 import sys
 import uuid
@@ -19,10 +18,8 @@ subprocess.run(
 from huggingface_hub import snapshot_download
-# Create xcodec_mini_infer folder
 folder_path = './xcodec_mini_infer'
-# Create the folder if it doesn't exist
 if not os.path.exists(folder_path):
     os.mkdir(folder_path)
     print(f"Folder created at: {folder_path}")
@@ -34,7 +31,7 @@ snapshot_download(
     local_dir="./xcodec_mini_infer"
 )
-# Change to the "inference" directory
 inference_dir = "."
 try:
     os.chdir(inference_dir)
@@ -46,16 +43,13 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
-# don't change above code
-import argparse
 import numpy as np
 import json
 from omegaconf import OmegaConf
 import torchaudio
 from torchaudio.transforms import Resample
 import soundfile as sf
 from tqdm import tqdm
 from einops import rearrange
 from codecmanipulator import CodecManipulator
@@ -67,12 +61,14 @@ import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
 device = "cuda:0"
-# Load model and tokenizer outside the generation function (load once)
 print("Loading model...")
 model = AutoModelForCausalLM.from_pretrained(
-    "m-a-p/YuE-s1-7B-anneal-en-cot",  # "m-a-p/YuE-s1-7B-anneal-en-icl",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
 ).to(device)
@@ -83,9 +79,9 @@ basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
 # Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
@@ -93,7 +89,9 @@ codec_model.load_state_dict(parameter_dict['codec_model'])
 codec_model.eval()
 print("Codec model loaded.")
 class BlockTokenRangeProcessor(LogitsProcessor):
     def __init__(self, start_id, end_id):
         self.blocked_token_ids = list(range(start_id, end_id))
@@ -118,17 +116,19 @@ def split_lyrics(lyrics: str):
     structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
     return structured_lyrics
 @spaces.GPU(duration=175)
-def requires_cuda(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale):
     """
-    This function wraps the heavy GPU inference that uses torch.autocast and torch.inference_mode.
-    It calls model.generate with the appropriate parameters and returns the generated sequence.
     """
     with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
         output_seq = model.generate(
             input_ids=input_ids,
             max_new_tokens=max_new_tokens,
-            min_new_tokens=100,  # Keep min_new_tokens to avoid short generations
             do_sample=True,
             top_p=top_p,
             temperature=temperature,
@@ -142,12 +142,39 @@ def requires_cuda(input_ids, max_new_tokens, top_p, temperature, repetition_pena
             guidance_scale=guidance_scale,
             use_cache=True
         )
-        # If the output does not end with the EOS token, append it.
         if output_seq[0][-1].item() != mmtokenizer.eoa:
             tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
             output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
     return output_seq
 def generate_music(
         genre_txt=None,
         lyrics_txt=None,
@@ -161,163 +188,147 @@ def generate_music(
         rescale=False,
 ):
     """
-    Generates music based on given genre and lyrics, optionally using an audio prompt.
-    This function orchestrates the music generation process, including prompt formatting,
-    model inference, and audio post-processing.
     """
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please provide an audio prompt file when 'Use Audio Prompt' is enabled!")
-    cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
-    with tempfile.TemporaryDirectory() as output_dir:
-        stage1_output_dir = os.path.join(output_dir, f"stage1")
-        os.makedirs(stage1_output_dir, exist_ok=True)
-        stage1_output_set = []
-        genres = genre_txt.strip()
-        lyrics = split_lyrics(lyrics_txt + "\n")
-        # instruction
-        full_lyrics = "\n".join(lyrics)
-        prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
-        prompt_texts += lyrics
-        random_id = uuid.uuid4()
-        raw_output = None
-        # Decoding config
-        top_p = 0.93
-        temperature = 1.0
-        repetition_penalty = 1.2
-        start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
-        end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
-        # Format text prompt
-        run_n_segments = min(run_n_segments + 1, len(lyrics))
-        print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
-        for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
-            section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-            guidance_scale = 1.5 if i <= 1 else 1.2  # Adjust guidance scale per segment
-            if i == 0:
-                continue
-            if i == 1:
-                if use_audio_prompt:
-                    audio_prompt = load_audio_mono(audio_prompt_path)
-                    audio_prompt.unsqueeze_(0)
-                    with torch.no_grad():
-                        raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
-                    raw_codes = raw_codes.transpose(0, 1)
-                    raw_codes = raw_codes.cpu().numpy().astype(np.int16)
-                    code_ids = codectool.npy2ids(raw_codes[0])
-                    audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]
-                    audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
-                    sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
-                    head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
-                else:
-                    head_id = mmtokenizer.tokenize(prompt_texts[0])
-                prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
             else:
-                prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-            prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
-            input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
-            # Window slicing in case the sequence exceeds the model's context length
-            max_context = 16384 - max_new_tokens - 1
-            if input_ids.shape[-1] > max_context:
-                print(
-                    f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
-                input_ids = input_ids[:, -(max_context):]
-            # Perform the GPU-heavy inference using the requires_cuda function.
-            output_seq = requires_cuda(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale)
-            if i > 1:
-                raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
-            else:
-                raw_output = output_seq
-            print(len(raw_output))
-        # save raw output and check sanity
-        ids = raw_output[0].cpu().numpy()
-        soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
-        eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
-        if len(soa_idx) != len(eoa_idx):
-            raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
-        vocals = []
-        instrumentals = []
-        range_begin = 1 if use_audio_prompt else 0
-        for i in range(range_begin, len(soa_idx)):
-            codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
-            if codec_ids[0] == 32016:
-                codec_ids = codec_ids[1:]
-            codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]  # Ensure even length for reshape
-            vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
-            vocals.append(vocals_ids)
-            instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
-            instrumentals.append(instrumentals_ids)
-        vocals = np.concatenate(vocals, axis=1)
-        instrumentals = np.concatenate(instrumentals, axis=1)
-        vocal_save_path = os.path.join(stage1_output_dir, f"vocal_{random_id}".replace('.', '@') + '.npy')
-        inst_save_path = os.path.join(stage1_output_dir, f"instrumental_{random_id}".replace('.', '@') + '.npy')
-        np.save(vocal_save_path, vocals)
-        np.save(inst_save_path, instrumentals)
-        stage1_output_set.append(vocal_save_path)
-        stage1_output_set.append(inst_save_path)
-        print("Converting to Audio...")
-        # convert audio tokens to audio
-        def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
-            folder_path = os.path.dirname(path)
-            if not os.path.exists(folder_path):
-                os.makedirs(folder_path)
-            limit = 0.99
-            max_val = wav.abs().max()
-            wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
-            torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
-        # reconstruct tracks
-        recons_output_dir = os.path.join(output_dir, "recons")
-        recons_mix_dir = os.path.join(recons_output_dir, 'mix')
-        os.makedirs(recons_mix_dir, exist_ok=True)
-        tracks = []
-        for npy in stage1_output_set:
-            codec_result = np.load(npy)
-            decodec_rlt = []
-            with torch.no_grad():
-                decoded_waveform = codec_model.decode(
-                    torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
-            decoded_waveform = decoded_waveform.cpu().squeeze(0)
-            decodec_rlt.append(torch.as_tensor(decoded_waveform))
-            decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")  # Save as mp3 for gradio
-            tracks.append(save_path)
-            save_audio(decodec_rlt, save_path, 16000)
-        # mix tracks
-        for inst_path in tracks:
-            try:
-                if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) and 'instrumental' in inst_path:
-                    # find pair
-                    vocal_path = inst_path.replace('instrumental', 'vocal')
-                    if not os.path.exists(vocal_path):
-                        continue
-                    # mix
-                    recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
-                    vocal_stem, sr = sf.read(vocal_path)
-                    instrumental_stem, _ = sf.read(inst_path)
-                    mix_stem = (vocal_stem + instrumental_stem) / 1
-                    return (sr, (mix_stem * 32767).astype(np.int16)), (sr, (vocal_stem * 32767).astype(np.int16)), (sr, (instrumental_stem * 32767).astype(np.int16))
-            except Exception as e:
-                print(e)
-                return None, None, None
 # Gradio Interface
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
@@ -350,7 +361,6 @@ with gr.Blocks() as demo:
                     instrumental_out = gr.Audio(label="Instrumental Audio")
         gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
-        # When the "Submit" button is clicked, pass the additional audio-related inputs to the function.
         submit_btn.click(
             fn=generate_music,
             inputs=[
@@ -364,7 +374,6 @@ with gr.Blocks() as demo:
             outputs=[music_out, vocal_out, instrumental_out]
         )
-        # Examples updated to only include text inputs
         gr.Examples(
             examples=[
                 [

 import gradio as gr
 import subprocess
 import os
 import spaces
+import shutil
 import torch
 import sys
 import uuid
 from huggingface_hub import snapshot_download
+# Create xcodec_mini_infer folder if it does not exist
 folder_path = './xcodec_mini_infer'
 if not os.path.exists(folder_path):
     os.mkdir(folder_path)
     print(f"Folder created at: {folder_path}")
     local_dir="./xcodec_mini_infer"
 )
+# Change working directory if needed
 inference_dir = "."
 try:
     os.chdir(inference_dir)
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 import numpy as np
 import json
+import argparse
 from omegaconf import OmegaConf
 import torchaudio
 from torchaudio.transforms import Resample
 import soundfile as sf
 from tqdm import tqdm
 from einops import rearrange
 from codecmanipulator import CodecManipulator
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
+# ---------------------------------------------------------------------
+# Load models, configurations, and tokenizers (run once at startup)
+# ---------------------------------------------------------------------
 device = "cuda:0"
 print("Loading model...")
 model = AutoModelForCausalLM.from_pretrained(
+    "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
 ).to(device)
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
 # Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.eval()
 print("Codec model loaded.")
+# ---------------------------------------------------------------------
+# Helper Classes and Functions
+# ---------------------------------------------------------------------
 class BlockTokenRangeProcessor(LogitsProcessor):
     def __init__(self, start_id, end_id):
         self.blocked_token_ids = list(range(start_id, end_id))
     structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
     return structured_lyrics
+# ---------------------------
+# CUDA Heavy Functions
+# ---------------------------
 @spaces.GPU(duration=175)
+def requires_cuda_generation(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale):
     """
+    Performs the CUDA-intensive generation using the language model.
     """
     with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
         output_seq = model.generate(
             input_ids=input_ids,
             max_new_tokens=max_new_tokens,
+            min_new_tokens=100,  # To avoid too-short generations
             do_sample=True,
             top_p=top_p,
             temperature=temperature,
             guidance_scale=guidance_scale,
             use_cache=True
         )
+        # If the generated sequence does not end with the end-of-audio token, append it.
         if output_seq[0][-1].item() != mmtokenizer.eoa:
             tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
             output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
     return output_seq
+@spaces.GPU(duration=15)
+def requires_cuda_decode(codec_result):
+    """
+    Uses the codec model on the GPU to decode a given numpy array of codec IDs
+    into a waveform tensor.
+    """
+    with torch.no_grad():
+        # Convert the numpy result to tensor and move to device
+        codec_tensor = torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long)
+        # Insert a singleton axis in the middle: a 2-D (A, B) array becomes (A, 1, B) before decoding.
+        codec_tensor = codec_tensor.unsqueeze(0).permute(1, 0, 2).to(device)
+        decoded_waveform = codec_model.decode(codec_tensor)
+    return decoded_waveform.cpu().squeeze(0)
+def save_audio(wav: torch.Tensor, sample_rate: int, rescale: bool = False):
+    """
+    Convert a waveform tensor to a numpy array (16-bit PCM) without writing to disk.
+    """
+    limit = 0.99
+    max_val = wav.abs().max()
+    wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
+    # Return a tuple as expected by Gradio: (sample_rate, np.array)
+    return sample_rate, (wav.numpy() * 32767).astype(np.int16)
+# ---------------------------------------------------------------------
+# Main Generation Function (without temporary files/directories)
+# ---------------------------------------------------------------------
 def generate_music(
         genre_txt=None,
         lyrics_txt=None,
         rescale=False,
 ):
     """
+    Generates music based on genre and lyrics (and optionally an audio prompt).
+    The heavy CUDA computations are performed in helper functions.
+    All intermediate data is kept in memory.
     """
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please provide an audio prompt file when 'Use Audio Prompt' is enabled!")
+    # max_new_tokens is supplied in units of 100 tokens; convert it to an absolute token count.
     max_new_tokens = max_new_tokens * 100
+    # Prepare prompt texts from genre and lyrics
+    genres = genre_txt.strip()
+    lyrics_segments = split_lyrics(lyrics_txt + "\n")
+    full_lyrics = "\n".join(lyrics_segments)
+    # The first prompt is the overall instruction and full lyrics.
+    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
+    # Then add each individual lyric segment.
+    prompt_texts += lyrics_segments
+    random_id = uuid.uuid4()
+    raw_output = None
+    # Generation configuration
+    top_p = 0.93
+    temperature = 1.0
+    repetition_penalty = 1.2
+    start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
+    end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
+    # Limit the number of segments to generate (adding 1 because the first prompt is a header)
+    run_n_segments = min(run_n_segments + 1, len(prompt_texts))
+    print("Starting generation for segments:")
+    print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
+    # Loop over each prompt segment
+    for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
+        section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
+        # Adjust guidance scale based on segment index
+        guidance_scale = 1.5 if i <= 1 else 1.2
+        # Index 0 is the header prompt: nothing is generated for it on its own.
+        # Its tokens are prepended to the first lyric segment (i == 1) below.
+        if i == 0:
+            continue
+        if i == 1:
+            # Process audio prompt if provided
+            if use_audio_prompt:
+                audio_prompt = load_audio_mono(audio_prompt_path)
+                audio_prompt = audio_prompt.unsqueeze(0)
+                with torch.no_grad():
+                    raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
+                raw_codes = raw_codes.transpose(0, 1)
+                raw_codes = raw_codes.cpu().numpy().astype(np.int16)
+                code_ids = codectool.npy2ids(raw_codes[0])
+                # Select a slice corresponding to the provided time range.
+                audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]
+                audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
+                sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
+                head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
             else:
+                head_id = mmtokenizer.tokenize(prompt_texts[0])
+            prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
+        else:
+            prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
+        # Convert prompt tokens to tensor and move to device
+        prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
+        input_ids = torch.cat([raw_output, prompt_ids], dim=1) if (i > 1 and raw_output is not None) else prompt_ids
+        # Ensure input length does not exceed model context window (using last tokens if needed)
+        max_context = 16384 - max_new_tokens - 1
+        if input_ids.shape[-1] > max_context:
+            print(
+                f'Section {i}: input length {input_ids.shape[-1]} exceeds context length {max_context}. Using last {max_context} tokens.'
+            )
+            input_ids = input_ids[:, -max_context:]
+        # Generate new tokens using the CUDA-heavy helper function
+        output_seq = requires_cuda_generation(
+            input_ids,
+            max_new_tokens,
+            top_p,
+            temperature,
+            repetition_penalty,
+            guidance_scale
+        )
+        # Accumulate outputs across segments
+        if i > 1:
+            raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
+        else:
+            raw_output = output_seq
+        print(f"Accumulated output length: {raw_output.shape[-1]} tokens")
+    # After generation, convert raw output tokens into codec IDs.
+    ids = raw_output[0].cpu().numpy()
+    soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
+    eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
+    if len(soa_idx) != len(eoa_idx):
+        raise ValueError(f"Invalid pairs of soa and eoa: Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}")
+    vocals_list = []
+    instrumentals_list = []
+    # If an audio prompt was used, skip the first pair.
+    range_begin = 1 if use_audio_prompt else 0
+    for i in range(range_begin, len(soa_idx)):
+        codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
+        if codec_ids[0] == 32016:
+            codec_ids = codec_ids[1:]
+        # Ensure even length for reshaping into two tracks (vocal and instrumental)
+        codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]
+        reshaped = rearrange(codec_ids, "(n b) -> b n", b=2)
+        vocals_ids = codectool.ids2npy(reshaped[0])
+        instrumentals_ids = codectool.ids2npy(reshaped[1])
+        vocals_list.append(vocals_ids)
+        instrumentals_list.append(instrumentals_ids)
+    # Concatenate segments in time dimension
+    vocals_codec = np.concatenate(vocals_list, axis=1)
+    instrumentals_codec = np.concatenate(instrumentals_list, axis=1)
+    print("Decoding audio on GPU...")
+    # Decode the codec arrays to waveforms using the CUDA helper function.
+    vocal_waveform = requires_cuda_decode(vocals_codec)
+    instrumental_waveform = requires_cuda_decode(instrumentals_codec)
+    # Mix the two waveforms (simple summation)
+    mixed_waveform = (vocal_waveform + instrumental_waveform) / 1.0
+    # Return the three audio outputs (mixed, vocal, instrumental) as tuples (sample_rate, np.array)
+    sample_rate = 16000
+    mixed_audio = save_audio(mixed_waveform, sample_rate, rescale)
+    vocal_audio = save_audio(vocal_waveform, sample_rate, rescale)
+    instrumental_audio = save_audio(instrumental_waveform, sample_rate, rescale)
+    return mixed_audio, vocal_audio, instrumental_audio
+# ---------------------------------------------------------------------
 # Gradio Interface
+# ---------------------------------------------------------------------
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
                     instrumental_out = gr.Audio(label="Instrumental Audio")
         gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
         submit_btn.click(
             fn=generate_music,
             inputs=[
             outputs=[music_out, vocal_out, instrumental_out]
         )
         gr.Examples(
             examples=[
                 [