YuE-music-generator-demo-zero

Paused

App Files Files Community

KingNish committed on Jan 31

Commit

310cc12

verified ·

1 Parent(s): 848a314

Update app.py

Browse files

Files changed (1) hide show

app.py +258 -232

app.py CHANGED Viewed

@@ -46,6 +46,7 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 # don't change above code
 import argparse
@@ -66,35 +67,31 @@ import time
 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
-from vocoder import build_codec_model, process_audio # added vocoder back
-from post_process_audio import replace_low_freq_with_energy_matched # added post process back
 device = "cuda:0"
-# Stage 1 model
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
-    # low_cpu_mem_usage=True,
 ).to(device)
 model.eval()
-# Stage 2 model
-stage2_model_path = "m-a-p/YuE-s2-1B-general"
-model_stage2 = AutoModelForCausalLM.from_pretrained(
-    stage2_model_path,
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2"
-    )
-model_stage2.to(device)
-model_stage2.eval()
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
-config_path = './xcodec_mini_infer/decoders/config.yaml' # added vocoder
-vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth' # added vocoder
-inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth' # added vocoder
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
@@ -105,19 +102,170 @@ model_config = OmegaConf.load(basic_model_config)
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
-# codec_model = torch.compile(codec_model)
 codec_model.eval()
-# Preload and compile vocoders # added vocoder
 vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
 vocal_decoder.to(device)
 inst_decoder.to(device)
-#vocal_decoder = torch.compile(vocal_decoder)
-#inst_decoder = torch.compile(inst_decoder)
 vocal_decoder.eval()
 inst_decoder.eval()
-@spaces.GPU(duration=150)
 def generate_music(
         max_new_tokens=5,
         run_n_segments=2,
@@ -141,31 +289,6 @@ def generate_music(
         os.makedirs(stage1_output_dir, exist_ok=True)
         os.makedirs(stage2_output_dir, exist_ok=True)
-        class BlockTokenRangeProcessor(LogitsProcessor):
-            def __init__(self, start_id, end_id):
-                self.blocked_token_ids = list(range(start_id, end_id))
-            def __call__(self, input_ids, scores):
-                scores[:, self.blocked_token_ids] = -float("inf")
-                return scores
-        def load_audio_mono(filepath, sampling_rate=16000):
-            audio, sr = torchaudio.load(filepath)
-            # Convert to mono
-            audio = torch.mean(audio, dim=0, keepdim=True)
-            # Resample if needed
-            if sr != sampling_rate:
-                resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
-                audio = resampler(audio)
-            return audio
-        def split_lyrics(lyrics: str):
-            pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
-            segments = re.findall(pattern, lyrics, re.DOTALL)
-            structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
-            return structured_lyrics
-        # Call the function and print the result
         stage1_output_set = []
         genres = genre_txt.strip()
@@ -280,136 +403,8 @@ def generate_music(
         stage1_output_set.append(vocal_save_path)
         stage1_output_set.append(inst_save_path)
-        print("Stage 2 inference...") # stage 2 inference
-        def stage2_generate(model, prompt, batch_size=16):
-            codec_ids = codectool.unflatten(prompt, n_quantizer=1)
-            codec_ids = codectool.offset_tok_ids(
-                            codec_ids,
-                            global_offset=codectool.global_offset,
-                            codebook_size=codectool.codebook_size,
-                            num_codebooks=codectool.num_codebooks,
-                        ).astype(np.int32)
-            # Prepare prompt_ids based on batch size or single input
-            if batch_size > 1:
-                codec_list = []
-                for i in range(batch_size):
-                    idx_begin = i * 300
-                    idx_end = (i + 1) * 300
-                    codec_list.append(codec_ids[:, idx_begin:idx_end])
-                codec_ids = np.concatenate(codec_list, axis=0)
-                prompt_ids = np.concatenate(
-                    [
-                        np.tile([mmtokenizer.soa, mmtokenizer.stage_1], (batch_size, 1)),
-                        codec_ids,
-                        np.tile([mmtokenizer.stage_2], (batch_size, 1)),
-                    ],
-                    axis=1
-                )
-            else:
-                prompt_ids = np.concatenate([
-                    np.array([mmtokenizer.soa, mmtokenizer.stage_1]),
-                    codec_ids.flatten(),  # Flatten the 2D array to 1D
-                    np.array([mmtokenizer.stage_2])
-                ]).astype(np.int32)
-                prompt_ids = prompt_ids[np.newaxis, ...]
-            codec_ids = torch.as_tensor(codec_ids).to(device)
-            prompt_ids = torch.as_tensor(prompt_ids).to(device)
-            len_prompt = prompt_ids.shape[-1]
-            block_list = LogitsProcessorList([BlockTokenRangeProcessor(0, 46358), BlockTokenRangeProcessor(53526, mmtokenizer.vocab_size)])
-            # Teacher forcing generate loop
-            for frames_idx in range(codec_ids.shape[1]):
-                cb0 = codec_ids[:, frames_idx:frames_idx+1]
-                prompt_ids = torch.cat([prompt_ids, cb0], dim=1)
-                input_ids = prompt_ids
-                with torch.no_grad():
-                    stage2_output = model.generate(input_ids=input_ids,
-                        min_new_tokens=7,
-                        max_new_tokens=7,
-                        eos_token_id=mmtokenizer.eoa,
-                        pad_token_id=mmtokenizer.eoa,
-                        logits_processor=block_list,
-                    )
-                assert stage2_output.shape[1] - prompt_ids.shape[1] == 7, f"output new tokens={stage2_output.shape[1]-prompt_ids.shape[1]}"
-                prompt_ids = stage2_output
-            # Return output based on batch size
-            if batch_size > 1:
-                output = prompt_ids.cpu().numpy()[:, len_prompt:]
-                output_list = [output[i] for i in range(batch_size)]
-                output = np.concatenate(output_list, axis=0)
-            else:
-                output = prompt_ids[0].cpu().numpy()[len_prompt:]
-            return output
-        def stage2_inference(model, stage1_output_set, stage2_output_dir, batch_size=4):
-            stage2_result = []
-            for i in tqdm(range(len(stage1_output_set))):
-                output_filename = os.path.join(stage2_output_dir, os.path.basename(stage1_output_set[i]))
-                if os.path.exists(output_filename):
-                    print(f'{output_filename} stage2 has done.')
-                    continue
-                # Load the prompt
-                prompt = np.load(stage1_output_set[i]).astype(np.int32)
-                # Only accept 6s segments
-                output_duration = prompt.shape[-1] // 50 // 6 * 6
-                num_batch = output_duration // 6
-                if num_batch <= batch_size:
-                    # If num_batch is less than or equal to batch_size, we can infer the entire prompt at once
-                    output = stage2_generate(model, prompt[:, :output_duration*50], batch_size=num_batch)
-                else:
-                    # If num_batch is greater than batch_size, process in chunks of batch_size
-                    segments = []
-                    num_segments = (num_batch // batch_size) + (1 if num_batch % batch_size != 0 else 0)
-                    for seg in range(num_segments):
-                        start_idx = seg * batch_size * 300
-                        # Ensure the end_idx does not exceed the available length
-                        end_idx = min((seg + 1) * batch_size * 300, output_duration*50)  # Adjust the last segment
-                        current_batch_size = batch_size if seg != num_segments-1 or num_batch % batch_size == 0 else num_batch % batch_size
-                        segment = stage2_generate(
-                            model,
-                            prompt[:, start_idx:end_idx],
-                            batch_size=current_batch_size
-                        )
-                        segments.append(segment)
-                    # Concatenate all the segments
-                    output = np.concatenate(segments, axis=0)
-                # Process the ending part of the prompt
-                if output_duration*50 != prompt.shape[-1]:
-                    ending = stage2_generate(model, prompt[:, output_duration*50:], batch_size=1)
-                    output = np.concatenate([output, ending], axis=0)
-                output = codectool_stage2.ids2npy(output)
-                # Fix invalid codes (a dirty solution, which may harm the quality of audio)
-                # We are trying to find better one
-                fixed_output = copy.deepcopy(output)
-                for i, line in enumerate(output):
-                    for j, element in enumerate(line):
-                        if element < 0 or element > 1023:
-                            counter = Counter(line)
-                            most_frequant = sorted(counter.items(), key=lambda x: x[1], reverse=True)[0][0]
-                            fixed_output[i, j] = most_frequant
-                # save output
-                np.save(output_filename, fixed_output)
-                stage2_result.append(output_filename)
-            return stage2_result
-        stage2_result = stage2_inference(model_stage2, stage1_output_set, stage2_output_dir, batch_size=4)
-        print(stage2_result)
         print('Stage 2 DONE.\n')
         print("Converting to Audio...")
@@ -424,14 +419,14 @@ def generate_music(
             wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
             torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
-        # reconstruct tracks
-        recons_output_dir = os.path.join(output_dir, "recons")
         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
         os.makedirs(recons_mix_dir, exist_ok=True)
-        tracks = []
-        for npy in stage2_result:
             codec_result = np.load(npy)
-            decodec_rlt = []
             with torch.no_grad():
                 decoded_waveform = codec_model.decode(
                     torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
@@ -439,42 +434,29 @@ def generate_music(
             decoded_waveform = decoded_waveform.cpu().squeeze(0)
             decodec_rlt.append(torch.as_tensor(decoded_waveform))
             decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
-            tracks.append(save_path)
             save_audio(decodec_rlt, save_path, 16000)
-        # mix tracks
-        for inst_path in tracks:
-            try:
-                if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
-                        and 'instrumental' in inst_path:
-                    # find pair
-                    vocal_path = inst_path.replace('instrumental', 'vocal')
-                    if not os.path.exists(vocal_path):
-                        continue
-                    # mix
-                    recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
-                    vocal_stem, sr = sf.read(inst_path)
-                    instrumental_stem, _ = sf.read(vocal_path)
-                    mix_stem = (vocal_stem + instrumental_stem) / 1
-                    sf.write(recons_mix, mix_stem, sr) # saving 16k mix audio
-            except Exception as e:
-                print(e)
-        print("Upsampling audio...")
-        # vocoder to upsample audios
-        vocoder_output_dir = os.path.join(output_dir, 'vocoder')
-        vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
-        vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
-        os.makedirs(vocoder_mix_dir, exist_ok=True)
         os.makedirs(vocoder_stems_dir, exist_ok=True)
         for npy in stage2_result:
             if 'instrumental' in npy:
                 # Process instrumental
                 instrumental_output = process_audio(
                     npy,
-                    os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
                     rescale,
-                    None,
                     inst_decoder,
                     codec_model
                 )
@@ -482,35 +464,78 @@ def generate_music(
                 # Process vocal
                 vocal_output = process_audio(
                     npy,
-                    os.path.join(vocoder_stems_dir, 'vocal.mp3'),
                     rescale,
-                    None,
                     vocal_decoder,
                     codec_model
                 )
-        # mix tracks
         try:
             mix_output = instrumental_output + vocal_output
-            vocoder_mix = os.path.join(vocoder_mix_dir, os.path.basename(recons_mix))
-            save_audio(mix_output, vocoder_mix, 44100, rescale) # saving 44.1k mix audio
             print(f"Created mix: {vocoder_mix}")
         except RuntimeError as e:
             print(e)
-            print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
-        # Post process
-        final_mix_path = os.path.join(output_dir, os.path.basename(recons_mix))
-        replace_low_freq_with_energy_matched(
-            a_file=recons_mix,     # 16kHz
-            b_file=vocoder_mix,     # 48kHz
-            c_file=final_mix_path,
-            cutoff_freq=5500.0
-        )
-        # return final mix, upsampled vocal stem, upsampled instrumental stem
-        return (44100, (mix_output.cpu().numpy() * 32767).astype(np.int16)), (44100, (vocal_output.cpu().numpy() * 32767).astype(np.int16)), (44100, (instrumental_output.cpu().numpy() * 32767).astype(np.int16))
-def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=5):
     # Execute the command
     try:
         mixed_audio_data, vocal_audio_data, instrumental_audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
@@ -522,6 +547,7 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=
     finally:
         print("Temporary files deleted.")
 # Gradio
 with gr.Blocks() as demo:
     with gr.Column():
@@ -549,10 +575,10 @@ with gr.Blocks() as demo:
                 max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                 submit_btn = gr.Button("Submit")
-                music_out = gr.Audio(label="Mixed Audio Result")
-                with gr.Accordion(label="Vocal and Instrumental Result", open=False):
-                    vocal_out = gr.Audio(label="Vocal Audio")
-                    instrumental_out = gr.Audio(label="Instrumental Audio")
         gr.Examples(
             examples=[

 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 # don't change above code
 import argparse
 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
+from vocoder import build_codec_model, process_audio
+from post_process_audio import replace_low_freq_with_energy_matched
 device = "cuda:0"
+stage2_model = "m-a-p/YuE-s2-1B-general"
+model_stage2 = AutoModelForCausalLM.from_pretrained(
+    stage2_model,
+    torch_dtype=torch.float16,
+    attn_implementation="flash_attention_2"
+    ).to(device)
+model_stage2.eval()
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
 ).to(device)
 model.eval()
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
+config_path = './xcodec_mini_infer/decoders/config.yaml'
+vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth'
+inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth'
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
 codec_model.eval()
+# Preload and compile vocoders
 vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
 vocal_decoder.to(device)
 inst_decoder.to(device)
 vocal_decoder.eval()
 inst_decoder.eval()
+class BlockTokenRangeProcessor(LogitsProcessor):
+    """Logits processor that forbids every token id in ``[start_id, end_id)``."""
+    def __init__(self, start_id, end_id):
+        # Materialise the half-open id range once; it is reused on every call.
+        self.blocked_token_ids = [*range(start_id, end_id)]
+    def __call__(self, input_ids, scores):
+        """Overwrite the blocked columns of ``scores`` with -inf in place and return it."""
+        neg_inf = float("-inf")
+        scores[:, self.blocked_token_ids] = neg_inf
+        return scores
+def load_audio_mono(filepath, sampling_rate=16000):
+    """Load ``filepath`` with torchaudio, downmix it and resample to ``sampling_rate``.
+
+    Returns the averaged waveform; dim 0 is kept (size 1) after averaging.
+    """
+    waveform, source_rate = torchaudio.load(filepath)
+    # Downmix: average over dim 0 while keeping that dimension.
+    mono = waveform.mean(dim=0, keepdim=True)
+    # Nothing more to do when the file is already at the requested rate.
+    if source_rate == sampling_rate:
+        return mono
+    return Resample(orig_freq=source_rate, new_freq=sampling_rate)(mono)
+def split_lyrics(lyrics: str):
+    """Split tagged lyrics into one formatted string per section.
+
+    A section is a ``[tag]`` header followed by its text. Each returned item
+    is the header line, the stripped section text, and a blank line.
+    Text before the first header is ignored; input without any header
+    yields an empty list.
+
+    The last section is kept even when ``lyrics`` does not end with a
+    newline (it used to be dropped silently in that case).
+    """
+    # A section ends at a newline that is followed by the next "[" header or
+    # by the end of the text. The bare \Z alternative additionally closes a
+    # final section that has no trailing newline; it can only match at the
+    # very end, so inputs that already matched produce the same result.
+    pattern = r"\[(\w+)\](.*?)(?:\n(?=\[|\Z)|\Z)"
+    segments = re.findall(pattern, lyrics, re.DOTALL)
+    structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
+    return structured_lyrics
+def stage2_generate(model, prompt, batch_size=1): # set batch_size=1 for gradio demo
+    """Generate stage-2 tokens for ``prompt`` by teacher-forcing its stage-1 codes.
+
+    ``prompt`` is unflattened and offset through ``codectool``. For every
+    frame the stage-1 code is appended to the running sequence and the model
+    must emit exactly 7 further tokens; sampling is restricted to ids in
+    [46358, 53526) by the logits processors.
+
+    With ``batch_size > 1`` the frames are cut into ``batch_size`` windows of
+    300 frames that are generated as one batch and concatenated again in
+    order. Returns a 1-D numpy array holding only the newly produced part of
+    the sequence (the prompt prefix is stripped).
+    """
+    chunk_frames = 300  # window length per batch row
+    tokens_per_frame = 7  # tokens the model has to add after each forced code
+    stage1_ids = codectool.offset_tok_ids(
+        codectool.unflatten(prompt, n_quantizer=1),
+        global_offset=codectool.global_offset,
+        codebook_size=codectool.codebook_size,
+        num_codebooks=codectool.num_codebooks,
+    ).astype(np.int32)
+    batched = batch_size > 1
+    if batched:
+        # One row per window: stack the consecutive 300-frame slices.
+        stage1_ids = np.concatenate(
+            [stage1_ids[:, k * chunk_frames:(k + 1) * chunk_frames] for k in range(batch_size)],
+            axis=0,
+        )
+        head = np.tile([mmtokenizer.soa, mmtokenizer.stage_1], (batch_size, 1))
+        tail = np.tile([mmtokenizer.stage_2], (batch_size, 1))
+        prompt_arr = np.concatenate([head, stage1_ids, tail], axis=1)
+    else:
+        flat_prompt = np.concatenate([
+            np.array([mmtokenizer.soa, mmtokenizer.stage_1]),
+            stage1_ids.flatten(),
+            np.array([mmtokenizer.stage_2]),
+        ])
+        # Add the leading batch axis expected by generate().
+        prompt_arr = flat_prompt.astype(np.int32)[np.newaxis, ...]
+    stage1_ids = torch.as_tensor(stage1_ids).to(device)
+    sequence = torch.as_tensor(prompt_arr).to(device)
+    prefix_len = sequence.shape[-1]
+    processors = LogitsProcessorList([BlockTokenRangeProcessor(0, 46358), BlockTokenRangeProcessor(53526, mmtokenizer.vocab_size)])
+    # Teacher forcing: feed one stage-1 code per step, let the model continue.
+    for frame in range(stage1_ids.shape[1]):
+        sequence = torch.cat([sequence, stage1_ids[:, frame:frame + 1]], dim=1)
+        with torch.no_grad():
+            generated = model.generate(
+                input_ids=sequence,
+                min_new_tokens=tokens_per_frame,
+                max_new_tokens=tokens_per_frame,
+                eos_token_id=mmtokenizer.eoa,
+                pad_token_id=mmtokenizer.eoa,
+                logits_processor=processors,
+            )
+        assert generated.shape[1] - sequence.shape[1] == 7, f"output new tokens={generated.shape[1]-sequence.shape[1]}"
+        sequence = generated
+    if not batched:
+        return sequence[0].cpu().numpy()[prefix_len:]
+    # Drop the prompt prefix, then lay the batch rows end to end.
+    continuation = sequence.cpu().numpy()[:, prefix_len:]
+    return np.concatenate([continuation[row] for row in range(batch_size)], axis=0)
+def stage2_inference(model, stage1_output_set, stage2_output_dir, batch_size=1): # set batch_size=1 for gradio demo
+    """Run stage-2 generation for every stage-1 ``.npy`` file and save the results.
+
+    Args:
+        model: stage-2 model, passed through to ``stage2_generate``.
+        stage1_output_set: paths of the stage-1 ``.npy`` files to process.
+        stage2_output_dir: directory that receives one ``.npy`` per input,
+            stored under the input's base name.
+        batch_size: maximum number of 6-second windows generated per call.
+
+    Returns:
+        The stage-2 output paths, in input order. A file that already exists
+        is not regenerated but is still listed, so callers can decode cached
+        results (it used to be left out of the list).
+    """
+    stage2_result = []
+    for stage1_path in tqdm(stage1_output_set):
+        output_filename = os.path.join(stage2_output_dir, os.path.basename(stage1_path))
+        if os.path.exists(output_filename):
+            print(f'{output_filename} stage2 has done.')
+            # Fix: report the cached file instead of silently dropping it.
+            stage2_result.append(output_filename)
+            continue
+        # Load the prompt
+        prompt = np.load(stage1_path).astype(np.int32)
+        # Only whole 6s segments are batched (presumably 50 frames per second,
+        # i.e. 300 frames per segment); the remainder is handled further down.
+        output_duration = prompt.shape[-1] // 50 // 6 * 6
+        num_batch = output_duration // 6
+        if num_batch <= batch_size:
+            # Everything fits into a single call.
+            output = stage2_generate(model, prompt[:, :output_duration*50], batch_size=num_batch)
+        else:
+            # Process in chunks of at most batch_size segments (ceil division).
+            segments = []
+            num_segments = -(-num_batch // batch_size)
+            for seg in range(num_segments):
+                start_idx = seg * batch_size * 300
+                # The last chunk may hold fewer than batch_size segments.
+                end_idx = min((seg + 1) * batch_size * 300, output_duration*50)
+                current_batch_size = (end_idx - start_idx) // 300
+                segments.append(
+                    stage2_generate(
+                        model,
+                        prompt[:, start_idx:end_idx],
+                        batch_size=current_batch_size
+                    )
+                )
+            output = np.concatenate(segments, axis=0)
+        # Process the ending part of the prompt that does not fill a 6s segment
+        if output_duration*50 != prompt.shape[-1]:
+            ending = stage2_generate(model, prompt[:, output_duration*50:], batch_size=1)
+            output = np.concatenate([output, ending], axis=0)
+        output = codectool_stage2.ids2npy(output)
+        # Fix invalid codes (a dirty solution, which may harm the quality of audio):
+        # every code outside [0, 1023] is replaced by the most frequent value of
+        # its row. NOTE(review): that value is taken over the whole row, so it
+        # could itself be out of range - same as before, confirm whether intended.
+        fixed_output = copy.deepcopy(output)
+        for row_idx, row in enumerate(output):
+            invalid_cols = [col for col, code in enumerate(row) if code < 0 or code > 1023]
+            if not invalid_cols:
+                continue
+            # Count once per row instead of once per invalid element.
+            most_frequent = Counter(row).most_common(1)[0][0]
+            for col in invalid_cols:
+                fixed_output[row_idx, col] = most_frequent
+        # save output
+        np.save(output_filename, fixed_output)
+        stage2_result.append(output_filename)
+    return stage2_result
+@spaces.GPU(duration=120)
 def generate_music(
         max_new_tokens=5,
         run_n_segments=2,
         os.makedirs(stage1_output_dir, exist_ok=True)
         os.makedirs(stage2_output_dir, exist_ok=True)
         stage1_output_set = []
         genres = genre_txt.strip()
         stage1_output_set.append(vocal_save_path)
         stage1_output_set.append(inst_save_path)
+        print("Stage 2 inference...")
+        stage2_result = stage2_inference(model_stage2, stage1_output_set, stage2_output_dir, batch_size=1) # set batch_size=1 for gradio demo
         print('Stage 2 DONE.\n')
         print("Converting to Audio...")
             wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
             torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
+        # reconstruct tracks from stage 1
+        recons_output_dir = os.path.join(output_dir, "recons_stage1") # changed folder name to recons_stage1
         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
         os.makedirs(recons_mix_dir, exist_ok=True)
+        tracks_stage1 = [] # changed variable name to tracks_stage1
+        for npy in stage1_output_set:
             codec_result = np.load(npy)
+            decodec_rlt=[]
             with torch.no_grad():
                 decoded_waveform = codec_model.decode(
                     torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
             decoded_waveform = decoded_waveform.cpu().squeeze(0)
             decodec_rlt.append(torch.as_tensor(decoded_waveform))
             decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + "_stage1.mp3") # changed filename to include _stage1
+            tracks_stage1.append(save_path) # changed variable name to tracks_stage1
             save_audio(decodec_rlt, save_path, 16000)
+        # reconstruct tracks from stage 2 and vocoder
+        recons_output_dir = os.path.join(output_dir, "recons_stage2_vocoder") # changed folder name to recons_stage2_vocoder
+        recons_mix_dir = os.path.join(recons_output_dir, 'mix')
+        os.makedirs(recons_mix_dir, exist_ok=True)
+        tracks_stage2_vocoder = [] # changed variable name to tracks_stage2_vocoder
+        vocoder_stems_dir = os.path.join(recons_output_dir, 'stems') # vocoder output stems in recons_stage2_vocoder
         os.makedirs(vocoder_stems_dir, exist_ok=True)
+        vocal_output = None # initialize for mix error handling
+        instrumental_output = None # initialize for mix error handling
         for npy in stage2_result:
             if 'instrumental' in npy:
                 # Process instrumental
                 instrumental_output = process_audio(
                     npy,
+                    os.path.join(vocoder_stems_dir, 'instrumental.mp3'), # vocoder output to vocoder_stems_dir
                     rescale,
+                    None, # Removed args, use default vocoder args
                     inst_decoder,
                     codec_model
                 )
                 # Process vocal
                 vocal_output = process_audio(
                     npy,
+                    os.path.join(vocoder_stems_dir, 'vocal.mp3'), # vocoder output to vocoder_stems_dir
                     rescale,
+                    None, # Removed args, use default vocoder args
                     vocal_decoder,
                     codec_model
                 )
+        # mix tracks from vocoder output
         try:
             mix_output = instrumental_output + vocal_output
+            vocoder_mix = os.path.join(recons_mix_dir, 'mixed_stage2_vocoder.mp3') # mixed output in recons_stage2_vocoder, changed filename
+            save_audio(mix_output, vocoder_mix, 44100, rescale)
             print(f"Created mix: {vocoder_mix}")
+            tracks_stage2_vocoder.append(vocoder_mix) # add mixed vocoder output path
         except RuntimeError as e:
             print(e)
+            vocoder_mix = None # set to None if mix failed
+            print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape if instrumental_output is not None else 'None'}, vocal: {vocal_output.shape if vocal_output is not None else 'None'}")
+        # mix tracks from stage 1
+        mixed_stage1_path = None
+        vocal_stage1_path = None
+        instrumental_stage1_path = None
+        for inst_path in tracks_stage1: # changed variable name to tracks_stage1
+            try:
+                if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
+                        and 'instrumental' in inst_path:
+                    # find pair
+                    vocal_path = inst_path.replace('instrumental', 'vocal')
+                    if not os.path.exists(vocal_path):
+                        continue
+                    # mix
+                    recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental_stage1', 'mixed_stage1')) # changed mixed filename
+                    vocal_stem, sr = sf.read(vocal_path)
+                    instrumental_stem, _ = sf.read(inst_path)
+                    mix_stem = (vocal_stem + instrumental_stem) / 1
+                    sf.write(recons_mix, mix_stem, sr)
+                    mixed_stage1_path = recons_mix # store mixed stage 1 path
+                    vocal_stage1_path = vocal_path # store vocal stage 1 path
+                    instrumental_stage1_path = inst_path # store instrumental stage 1 path
+            except Exception as e:
+                print(e)
+        # Post process - skip post process for gradio to simplify.
+        # recons_mix_final_path = os.path.join(output_dir, os.path.basename(mixed_stage1_path).replace('_stage1', '_final')) # final output path
+        # replace_low_freq_with_energy_matched(
+        #     a_file=mixed_stage1_path,     # 16kHz
+        #     b_file=vocoder_mix,     # 48kHz
+        #     c_file=recons_mix_final_path,
+        #     cutoff_freq=5500.0
+        # )
+        if vocoder_mix is not None: # return vocoder mix if successful
+            mixed_audio_data, sr_vocoder_mix = sf.read(vocoder_mix)
+            vocal_audio_data = None # stage 2 vocoder stems are not mixed and returned in this demo, set to None
+            instrumental_audio_data = None # stage 2 vocoder stems are not mixed and returned in this demo, set to None
+            return (sr_vocoder_mix, (mixed_audio_data * 32767).astype(np.int16)), vocal_audio_data, instrumental_audio_data
+        elif mixed_stage1_path is not None: # if vocoder failed, return stage 1 mix
+            mixed_audio_data_stage1, sr_stage1_mix = sf.read(mixed_stage1_path)
+            vocal_audio_data_stage1, sr_vocal_stage1 = sf.read(vocal_stage1_path)
+            instrumental_audio_data_stage1, sr_inst_stage1 = sf.read(instrumental_stage1_path)
+            return (sr_stage1_mix, (mixed_audio_data_stage1 * 32767).astype(np.int16)), (sr_vocal_stage1, (vocal_audio_data_stage1 * 32767).astype(np.int16)), (sr_inst_stage1, (instrumental_audio_data_stage1 * 32767).astype(np.int16))
+        else: # if both failed, return None
+             return None, None, None
+def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
     # Execute the command
     try:
         mixed_audio_data, vocal_audio_data, instrumental_audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
     finally:
         print("Temporary files deleted.")
 # Gradio
 with gr.Blocks() as demo:
     with gr.Column():
                 max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                 submit_btn = gr.Button("Submit")
+                music_out = gr.Audio(label="Mixed Audio Result (Stage 2 + Vocoder)")
+                with gr.Accordion(label="Stage 1 Vocal and Instrumental Result", open=False):
+                    vocal_out = gr.Audio(label="Vocal Audio (Stage 1)")
+                    instrumental_out = gr.Audio(label="Instrumental Audio (Stage 1)")
         gr.Examples(
             examples=[