KingNish committed (verified)
Commit d305eb7 · 1 Parent(s): 7b1113e

Update app.py

Files changed (1)
  1. app.py +189 -204
app.py CHANGED
@@ -1,8 +1,9 @@
 import gradio as gr
 import subprocess
 import os
-import spaces
 import shutil
+import tempfile
+import spaces
 import torch
 import sys
 import uuid
@@ -18,8 +19,10 @@ subprocess.run(
 
 from huggingface_hub import snapshot_download
 
-# Create xcodec_mini_infer folder if it does not exist
+# Create xcodec_mini_infer folder
 folder_path = './xcodec_mini_infer'
+
+# Create the folder if it doesn't exist
 if not os.path.exists(folder_path):
     os.mkdir(folder_path)
     print(f"Folder created at: {folder_path}")
@@ -31,7 +34,7 @@ snapshot_download(
     local_dir="./xcodec_mini_infer"
 )
 
-# Change working directory if needed
+# Change to the "inference" directory
 inference_dir = "."
 try:
     os.chdir(inference_dir)
@@ -43,13 +46,16 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 
+# don't change above code
+
+import argparse
 import numpy as np
 import json
-import argparse
 from omegaconf import OmegaConf
 import torchaudio
 from torchaudio.transforms import Resample
 import soundfile as sf
+
 from tqdm import tqdm
 from einops import rearrange
 from codecmanipulator import CodecManipulator
@@ -61,14 +67,12 @@ import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
 
-# ---------------------------------------------------------------------
-# Load models, configurations, and tokenizers (run once at startup)
-# ---------------------------------------------------------------------
 device = "cuda:0"
 
+# Load model and tokenizer outside the generation function (load once)
 print("Loading model...")
 model = AutoModelForCausalLM.from_pretrained(
-    "m-a-p/YuE-s1-7B-anneal-en-cot",
+    "m-a-p/YuE-s1-7B-anneal-en-cot", # "m-a-p/YuE-s1-7B-anneal-en-icl",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
 ).to(device)
@@ -79,9 +83,9 @@ basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
 
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
+
 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
-
 # Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
@@ -89,9 +93,7 @@ codec_model.load_state_dict(parameter_dict['codec_model'])
 codec_model.eval()
 print("Codec model loaded.")
 
-# ---------------------------------------------------------------------
-# Helper Classes and Functions
-# ---------------------------------------------------------------------
+
 class BlockTokenRangeProcessor(LogitsProcessor):
     def __init__(self, start_id, end_id):
         self.blocked_token_ids = list(range(start_id, end_id))
@@ -116,69 +118,12 @@ def split_lyrics(lyrics: str):
     structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
     return structured_lyrics
 
-# ---------------------------
-# CUDA Heavy Functions
-# ---------------------------
-def requires_cuda_generation(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale):
-    """
-    Performs the CUDA-intensive generation using the language model.
-    """
-    with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
-        output_seq = model.generate(
-            input_ids=input_ids,
-            max_new_tokens=max_new_tokens,
-            min_new_tokens=100,  # To avoid too-short generations
-            do_sample=True,
-            top_p=top_p,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            eos_token_id=mmtokenizer.eoa,
-            pad_token_id=mmtokenizer.eoa,
-            logits_processor=LogitsProcessorList([
-                BlockTokenRangeProcessor(0, 32002),
-                BlockTokenRangeProcessor(32016, 32016)
-            ]),
-            guidance_scale=guidance_scale,
-            use_cache=True
-        )
-        # If the generated sequence does not end with the end-of-audio token, append it.
-        if output_seq[0][-1].item() != mmtokenizer.eoa:
-            tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
-            output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
-    return output_seq
-
-def requires_cuda_decode(codec_result):
-    """
-    Uses the codec model on the GPU to decode a given numpy array of codec IDs
-    into a waveform tensor.
-    """
-    with torch.no_grad():
-        # Convert the numpy result to tensor and move to device
-        codec_tensor = torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long)
-        # The expected shape is (seq_len, batch, channels), so we add and permute dims as needed.
-        codec_tensor = codec_tensor.unsqueeze(0).permute(1, 0, 2).to(device)
-        decoded_waveform = codec_model.decode(codec_tensor)
-    return decoded_waveform.cpu().squeeze(0)
-
-def save_audio(wav: torch.Tensor, sample_rate: int, rescale: bool = False):
-    """
-    Convert a waveform tensor to a numpy array (16-bit PCM) without writing to disk.
-    """
-    limit = 0.99
-    max_val = wav.abs().max()
-    wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
-    # Return a tuple as expected by Gradio: (sample_rate, np.array)
-    return sample_rate, (wav.numpy() * 32767).astype(np.int16)
-
-# ---------------------------------------------------------------------
-# Main Generation Function (without temporary files/directories)
-# ---------------------------------------------------------------------
 @spaces.GPU(duration=175)
 def generate_music(
     genre_txt=None,
     lyrics_txt=None,
     run_n_segments=2,
-    max_new_tokens=23,
+    max_new_tokens=15,
     use_audio_prompt=False,
     audio_prompt_path="",
     prompt_start_time=0.0,
@@ -187,147 +132,185 @@ def generate_music(
     rescale=False,
 ):
     """
-    Generates music based on genre and lyrics (and optionally an audio prompt).
-    The heavy CUDA computations are performed in helper functions.
-    All intermediate data is kept in memory.
+    Generates music based on given genre and lyrics, optionally using an audio prompt.
+    This function orchestrates the music generation process, including prompt formatting,
+    model inference, and audio post-processing.
     """
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please provide an audio prompt file when 'Use Audio Prompt' is enabled!")
-
-    # Scale max_new_tokens (e.g. each token may correspond to 100 time units)
+    cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
 
-    # Prepare prompt texts from genre and lyrics
-    genres = genre_txt.strip()
-    lyrics_segments = split_lyrics(lyrics_txt + "\n")
-    full_lyrics = "\n".join(lyrics_segments)
-    # The first prompt is the overall instruction and full lyrics.
-    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
-    # Then add each individual lyric segment.
-    prompt_texts += lyrics_segments
-
-    random_id = uuid.uuid4()
-    raw_output = None
-
-    # Generation configuration
-    top_p = 0.93
-    temperature = 1.0
-    repetition_penalty = 1.2
-    start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
-    end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
-
-    # Limit the number of segments to generate (adding 1 because the first prompt is a header)
-    run_n_segments = min(run_n_segments + 1, len(prompt_texts))
-
-    print("Starting generation for segments:")
-    print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
-
-    # Loop over each prompt segment
-    for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
-        section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-        # Adjust guidance scale based on segment index
-        guidance_scale = 1.5 if i <= 1 else 1.2
-
-        # For the header prompt, we just use the tokenized text.
-        if i == 0:
-            continue
-
-        if i == 1:
-            # Process audio prompt if provided
-            if use_audio_prompt:
-                audio_prompt = load_audio_mono(audio_prompt_path)
-                audio_prompt = audio_prompt.unsqueeze(0)
-                with torch.no_grad():
-                    raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
-                raw_codes = raw_codes.transpose(0, 1)
-                raw_codes = raw_codes.cpu().numpy().astype(np.int16)
-                code_ids = codectool.npy2ids(raw_codes[0])
-                # Select a slice corresponding to the provided time range.
-                audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]
-                audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
-                sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
-                head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
+    with tempfile.TemporaryDirectory() as output_dir:
+        stage1_output_dir = os.path.join(output_dir, f"stage1")
+        os.makedirs(stage1_output_dir, exist_ok=True)
+
+        stage1_output_set = []
+
+        genres = genre_txt.strip()
+        lyrics = split_lyrics(lyrics_txt + "\n")
+        # instruction
+        full_lyrics = "\n".join(lyrics)
+        prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
+        prompt_texts += lyrics
+
+        random_id = uuid.uuid4()
+        raw_output = None
+
+        # Decoding config
+        top_p = 0.93
+        temperature = 1.0
+        repetition_penalty = 1.2
+        start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
+        end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
+
+        # Format text prompt
+        run_n_segments = min(run_n_segments + 1, len(lyrics))
+
+        print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
+
+        for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
+            section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
+            guidance_scale = 1.5 if i <= 1 else 1.2  # Guidance scale adjusted based on segment index
+            if i == 0:
+                continue
+            if i == 1:
+                if use_audio_prompt:
+                    audio_prompt = load_audio_mono(audio_prompt_path)
+                    audio_prompt.unsqueeze_(0)
+                    with torch.no_grad():
+                        raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
+                    raw_codes = raw_codes.transpose(0, 1)
+                    raw_codes = raw_codes.cpu().numpy().astype(np.int16)
+                    # Format audio prompt
+                    code_ids = codectool.npy2ids(raw_codes[0])
+                    audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]  # 50 is tps of xcodec
+                    audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
+                        mmtokenizer.eoa]
+                    sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
+                        "[end_of_reference]")
+                    head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
+                else:
+                    head_id = mmtokenizer.tokenize(prompt_texts[0])
+                prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
             else:
-            head_id = mmtokenizer.tokenize(prompt_texts[0])
-            prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-        else:
-            prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-
-        # Convert prompt tokens to tensor and move to device
-        prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
-        input_ids = torch.cat([raw_output, prompt_ids], dim=1) if (i > 1 and raw_output is not None) else prompt_ids
-
-        # Ensure input length does not exceed model context window (using last tokens if needed)
-        max_context = 16384 - max_new_tokens - 1
-        if input_ids.shape[-1] > max_context:
-            print(
-                f'Section {i}: input length {input_ids.shape[-1]} exceeds context length {max_context}. Using last {max_context} tokens.'
-            )
-            input_ids = input_ids[:, -max_context:]
-
-        # Generate new tokens using the CUDA-heavy helper function
-        output_seq = requires_cuda_generation(
-            input_ids,
-            max_new_tokens,
-            top_p,
-            temperature,
-            repetition_penalty,
-            guidance_scale
-        )
+                prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
+
+            prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
+            input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
+
+            # Use window slicing in case output sequence exceeds the context of model
+            max_context = 16384 - max_new_tokens - 1
+            if input_ids.shape[-1] > max_context:
+                print(
+                    f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
+                input_ids = input_ids[:, -(max_context):]
+
+            with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
+                output_seq = model.generate(
+                    input_ids=input_ids,
+                    max_new_tokens=max_new_tokens,
+                    min_new_tokens=100,  # Keep min_new_tokens to avoid short generations
+                    do_sample=True,
+                    top_p=top_p,
+                    temperature=temperature,
+                    repetition_penalty=repetition_penalty,
+                    eos_token_id=mmtokenizer.eoa,
+                    pad_token_id=mmtokenizer.eoa,
+                    logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
+                    guidance_scale=guidance_scale,
+                    use_cache=True,
+                    num_beams=2
+                )
+                if output_seq[0][-1].item() != mmtokenizer.eoa:
+                    tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
+                    output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
+
+            output_seq = model_inference(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale)
+
+            if i > 1:
+                raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
+            else:
+                raw_output = output_seq
+            print(len(raw_output))
+
+        # save raw output and check sanity
+        ids = raw_output[0].cpu().numpy()
+        soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
+        eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
+        if len(soa_idx) != len(eoa_idx):
+            raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
+
+        vocals = []
+        instrumentals = []
+        range_begin = 1 if use_audio_prompt else 0
+        for i in range(range_begin, len(soa_idx)):
+            codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
+            if codec_ids[0] == 32016:
+                codec_ids = codec_ids[1:]
+            codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]  # Ensure even length for reshape
+            vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
+            vocals.append(vocals_ids)
+            instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
+            instrumentals.append(instrumentals_ids)
+        vocals = np.concatenate(vocals, axis=1)
+        instrumentals = np.concatenate(instrumentals, axis=1)
+
+        vocal_save_path = os.path.join(stage1_output_dir, f"vocal_{random_id}".replace('.', '@') + '.npy')
+        inst_save_path = os.path.join(stage1_output_dir, f"instrumental_{random_id}".replace('.', '@') + '.npy')
+        np.save(vocal_save_path, vocals)
+        np.save(inst_save_path, instrumentals)
+        stage1_output_set.append(vocal_save_path)
+        stage1_output_set.append(inst_save_path)
+
+        print("Converting to Audio...")
+
+        # convert audio tokens to audio
+        def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
+            folder_path = os.path.dirname(path)
+            if not os.path.exists(folder_path):
+                os.makedirs(folder_path)
+            limit = 0.99
+            max_val = wav.abs().max()
+            wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
+            torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
+
+        # reconstruct tracks
+        recons_output_dir = os.path.join(output_dir, "recons")
+        recons_mix_dir = os.path.join(recons_output_dir, 'mix')
+        os.makedirs(recons_mix_dir, exist_ok=True)
+        tracks = []
+        for npy in stage1_output_set:
+            codec_result = np.load(npy)
+            decodec_rlt = []
+            with torch.no_grad():
+                decoded_waveform = codec_model.decode(
+                    torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
+            decoded_waveform = decoded_waveform.cpu().squeeze(0)
+            decodec_rlt.append(torch.as_tensor(decoded_waveform))
+            decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")  # Save as mp3 for gradio
+            tracks.append(save_path)
+            save_audio(decodec_rlt, save_path, 16000)
+        # mix tracks
+        for inst_path in tracks:
+            try:
+                if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) and 'instrumental' in inst_path:
+                    # find pair
+                    vocal_path = inst_path.replace('instrumental', 'vocal')
+                    if not os.path.exists(vocal_path):
+                        continue
+                    # mix
+                    recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
+                    vocal_stem, sr = sf.read(vocal_path)
+                    instrumental_stem, _ = sf.read(inst_path)
+                    mix_stem = (vocal_stem + instrumental_stem) / 1
+                    return (sr, (mix_stem * 32767).astype(np.int16)), (sr, (vocal_stem * 32767).astype(np.int16)), (sr, (instrumental_stem * 32767).astype(np.int16))
+            except Exception as e:
+                print(e)
+        return None, None, None
 
-        # Accumulate outputs across segments
-        if i > 1:
-            raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
-        else:
-            raw_output = output_seq
-        print(f"Accumulated output length: {raw_output.shape[-1]} tokens")
-
-    # After generation, convert raw output tokens into codec IDs.
-    ids = raw_output[0].cpu().numpy()
-    soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
-    eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
-    if len(soa_idx) != len(eoa_idx):
-        raise ValueError(f"Invalid pairs of soa and eoa: Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}")
-
-    vocals_list = []
-    instrumentals_list = []
-    # If an audio prompt was used, skip the first pair.
-    range_begin = 1 if use_audio_prompt else 0
-    for i in range(range_begin, len(soa_idx)):
-        codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
-        if codec_ids[0] == 32016:
-            codec_ids = codec_ids[1:]
-        # Ensure even length for reshaping into two tracks (vocal and instrumental)
-        codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]
-        reshaped = rearrange(codec_ids, "(n b) -> b n", b=2)
-        vocals_ids = codectool.ids2npy(reshaped[0])
-        instrumentals_ids = codectool.ids2npy(reshaped[1])
-        vocals_list.append(vocals_ids)
-        instrumentals_list.append(instrumentals_ids)
-
-    # Concatenate segments in time dimension
-    vocals_codec = np.concatenate(vocals_list, axis=1)
-    instrumentals_codec = np.concatenate(instrumentals_list, axis=1)
-
-    print("Decoding audio on GPU...")
-
-    # Decode the codec arrays to waveforms using the CUDA helper function.
-    vocal_waveform = requires_cuda_decode(vocals_codec)
-    instrumental_waveform = requires_cuda_decode(instrumentals_codec)
-
-    # Mix the two waveforms (simple summation)
-    mixed_waveform = (vocal_waveform + instrumental_waveform) / 1.0
-
-    # Return the three audio outputs (mixed, vocal, instrumental) as tuples (sample_rate, np.array)
-    sample_rate = 16000
-    mixed_audio = save_audio(mixed_waveform, sample_rate, rescale)
-    vocal_audio = save_audio(vocal_waveform, sample_rate, rescale)
-    instrumental_audio = save_audio(instrumental_waveform, sample_rate, rescale)
-    return mixed_audio, vocal_audio, instrumental_audio
-
-# ---------------------------------------------------------------------
 # Gradio Interface
-# ---------------------------------------------------------------------
 with gr.Blocks() as demo:
     with gr.Column():
         gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
@@ -360,6 +343,7 @@ with gr.Blocks() as demo:
         instrumental_out = gr.Audio(label="Instrumental Audio")
         gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
 
+        # When the "Submit" button is clicked, pass the additional audio-related inputs to the function.
        submit_btn.click(
            fn=generate_music,
            inputs=[
@@ -373,6 +357,7 @@ with gr.Blocks() as demo:
            outputs=[music_out, vocal_out, instrumental_out]
        )
 
+        # Examples updated to only include text inputs
        gr.Examples(
            examples=[
                [
@@ -419,4 +404,4 @@ Locked inside my mind, hot flame.
            fn=generate_music
        )
 
-demo.queue().launch(show_error=True)
+demo.queue().launch(show_error=True)
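
Note on the stage-1 token layout the new code relies on (illustrative sketch, not part of the commit): the generated codec stream interleaves vocal and instrumental tokens, rearrange(codec_ids, "(n b) -> b n", b=2) de-interleaves them into the two stems, and each decoded stem is handed back to gr.Audio as a (sample_rate, 16-bit numpy array) tuple. The token values and the placeholder waveform below are made up for illustration.

import numpy as np
from einops import rearrange

# Toy interleaved stream: vocal, instrumental, vocal, instrumental, ... (made-up values)
codec_ids = np.array([10, 20, 11, 21, 12, 22])
codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]   # trim to even length, as app.py does

stems = rearrange(codec_ids, "(n b) -> b n", b=2)    # row 0 = vocal tokens, row 1 = instrumental tokens
print(stems[0])   # [10 11 12] -> decoded into the vocal waveform
print(stems[1])   # [20 21 22] -> decoded into the instrumental waveform

# After decoding, a float waveform in [-1, 1] is returned to Gradio as (sample_rate, int16 array),
# which is one of the output formats gr.Audio accepts:
sample_rate = 16000
waveform = np.zeros(sample_rate, dtype=np.float32)   # hypothetical 1-second stem
audio_out = (sample_rate, (waveform * 32767).astype(np.int16))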