YuE-music-generator-demo-zero

Paused

App Files Files Community

KingNish committed on Jan 29

Commit

649509e

verified ·

1 Parent(s): 8cd422c

Update app.py

Browse files

Files changed (1) hide show

app.py +435 -211

app.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import gradio as gr
 import subprocess
-import os
 import shutil
 import tempfile
 import spaces
 import sys
-import re
 print("Installing flash-attn...")
 # Install flash attention
@@ -43,216 +44,448 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
-import gradio as gr
-import os
-import shutil
-import tempfile
-import spaces
-import torch
 import numpy as np
-from pathlib import Path
-from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 import torchaudio
 import soundfile as sf
-from functools import lru_cache
-from concurrent.futures import ThreadPoolExecutor
-from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessorList
-from models.soundstream_hubert_new import SoundStream
-from vocoder import build_codec_model
-from mmtokenizer import _MMSentencePieceTokenizer
 from codecmanipulator import CodecManipulator
-# --------------------------
-# Configuration Constants
-# --------------------------
-MODEL_DIR = Path("./xcodec_mini_infer")
-OUTPUT_DIR = Path("./output")
-DEVICE = "cuda:0"
-TORCH_DTYPE = torch.float16
-MAX_CONTEXT = 16384 - 3000 - 1
-MAX_SEQ_LEN = 16384
-# --------------------------
-# Preload Models with KV Cache Initialization
-# --------------------------
-# Text generation model with KV cache support
-model = AutoModelForCausalLM.from_pretrained(
-    "m-a-p/YuE-s1-7B-anneal-en-cot",
-    torch_dtype=TORCH_DTYPE,
-    attn_implementation="flash_attention_2",
-    use_cache=True  # Enable KV caching
-).to(DEVICE).eval()
-# Tokenizer and codec tools
-mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
-codectool = CodecManipulator("xcodec", 0, 1)
-# Audio codec model
-model_config = OmegaConf.load(MODEL_DIR/"final_ckpt/config.yaml")
-codec_model = SoundStream(**model_config.generator.config).to(DEVICE)
-codec_model.load_state_dict(
-    torch.load(MODEL_DIR/"final_ckpt/ckpt_00360000.pth", map_location='cpu')['codec_model']
-)
-codec_model.eval()
-# Vocoders
-vocal_decoder, inst_decoder = build_codec_model(
-    MODEL_DIR/"decoders/config.yaml",
-    MODEL_DIR/"decoders/decoder_131000.pth",
-    MODEL_DIR/"decoders/decoder_151000.pth"
-)
-# --------------------------
-# Optimized Generation with KV Cache Management
-# --------------------------
-class KVCacheManager:
-    def __init__(self, model):
-        self.model = model
-        self.past_key_values = None
-        self.current_length = 0
-    def reset(self):
-        self.past_key_values = None
-        self.current_length = 0
-    def generate_with_cache(self, input_ids, generation_config):
-        outputs = self.model(
-            input_ids,
-            past_key_values=self.past_key_values,
-            use_cache=True,
-            output_hidden_states=False,
-            return_dict=True
-        )
-        self.past_key_values = outputs.past_key_values
-        self.current_length += input_ids.shape[1]
-        return outputs.logits
-def split_lyrics(lyrics: str):
-    pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
-    segments = re.findall(pattern, lyrics, re.DOTALL)
-    return [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
-@torch.inference_mode()
-def process_audio_batch(codec_ids, decoder, sample_rate=44100):
-    decoded = codec_model.decode(
-        torch.as_tensor(codec_ids.astype(np.int16), dtype=torch.long)
-        .unsqueeze(0).permute(1, 0, 2).to(DEVICE)
     )
-    return decoded.cpu().squeeze(0)
-# --------------------------
-# Core Generation Logic with KV Cache
-# --------------------------
-def generate_music(genre_txt, lyrics_txt, num_segments=2, max_new_tokens=2000):
-    # Initialize KV cache manager
-    cache_manager = KVCacheManager(model)
-    # Preprocess inputs
-    genres = genre_txt.strip()
-    structured_lyrics = split_lyrics(lyrics_txt+"\n")
-    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{''.join(structured_lyrics)}"] + structured_lyrics
-    # Generation loop with KV cache
-    all_generated = []
-    for i in range(1, min(num_segments+1, len(prompt_texts))):
-        input_ids = prepare_inputs(prompt_texts, i, all_generated)
-        input_ids = input_ids.to(DEVICE)
-        # Generate segment with KV cache
-        segment_output = []
-        for _ in range(max_new_tokens):
-            logits = cache_manager.generate_with_cache(input_ids, None)
-            # Sampling logic
-            probs = torch.nn.functional.softmax(logits[:, -1], dim=-1)
-            next_token = torch.multinomial(probs, num_samples=1)
-            segment_output.append(next_token.item())
-            input_ids = next_token.unsqueeze(0)
-            if next_token == mmtokenizer.eoa:
-                break
-        all_generated.extend(segment_output)
-        # Prevent cache overflow
-        if cache_manager.current_length > MAX_SEQ_LEN * 0.8:
-            cache_manager.reset()
-    # Process outputs
-    ids = np.array(all_generated)
-    vocals, instrumentals = process_outputs(ids)
-    # Parallel audio processing
-    with ThreadPoolExecutor() as executor:
-        vocal_future = executor.submit(process_audio_batch, vocals, vocal_decoder)
-        inst_future = executor.submit(process_audio_batch, instrumentals, inst_decoder)
-        vocal_wav = vocal_future.result()
-        inst_wav = inst_future.result()
-    # Mix and post-process
-    mixed = (vocal_wav + inst_wav) / 2
-    final_path = OUTPUT_DIR/"final_output.mp3"
-    save_audio(mixed, final_path, 44100)
-    return str(final_path)
-# --------------------------
-# Optimized Helper Functions
-# --------------------------
-@lru_cache(maxsize=10)
-def prepare_inputs(prompt_texts, index, previous_tokens):
-    current_prompt = mmtokenizer.tokenize(prompt_texts[index])
-    return torch.tensor([previous_tokens + current_prompt], dtype=torch.long, device=DEVICE)
-def process_outputs(ids):
     soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
     eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
     vocals = []
     instrumentals = []
-    for i in range(len(soa_idx)):
         codec_ids = ids[soa_idx[i]+1:eoa_idx[i]]
         codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
-        vocals.append(codectool.ids2npy(codec_ids[::2]))
-        instrumentals.append(codectool.ids2npy(codec_ids[1::2]))
-    return np.concatenate(vocals, axis=1), np.concatenate(instrumentals, axis=1)
-def save_audio(wav, path, sr):
-    wav = wav.clamp(-0.99, 0.99)
-    torchaudio.save(path, wav.cpu(), sr, encoding='PCM_S', bits_per_sample=16)
-# --------------------------
-# Gradio Interface
-# --------------------------
 @spaces.GPU(duration=120)
-def infer(genre, lyrics, num_segments=2, max_tokens=2000):
-    with tempfile.TemporaryDirectory() as tmpdir:
-        return generate_music(genre, lyrics, num_segments, max_tokens)
-# Gradio UI
-with gr.Blocks() as demo:
-    gr.Markdown("# YuE Music Generator with KV Cache Optimization")
-    with gr.Row():
-        with gr.Column():
-            genre_txt = gr.Textbox(label="Genre", placeholder="e.g., pop electronic female vocal")
-            lyrics_txt = gr.Textbox(label="Lyrics", lines=8,
-                                  placeholder="""[verse]\nYour lyrics here...""")
-            num_segments = gr.Slider(1, 10, value=2, label="Song Segments")
-            max_tokens = gr.Slider(100, 3000, value=1000, step=100,
-                                 label="Max Tokens per Segment (100≈1sec)")
-            submit_btn = gr.Button("Generate Music")
-        with gr.Column():
-            audio_output = gr.Audio(label="Generated Music", interactive=False)
-    gr.Examples(
-        examples=[
-            ["pop rock male vocal", """[verse]
 Woke up in the morning, sun is shining bright
 Chasing all my dreams, gotta get my mind right
 City lights are fading, but my vision's clear
@@ -266,29 +499,20 @@ Building up my future with my own two hands
 This is my life, and I'm aiming for the top
 Never gonna quit, no, I'm never gonna stop
 Through the highs and lows, I'mma keep it real
-Living out my dreams with this mic and a deal"""],
-            ["electronic dance synth female", """
-[verse]
-In the quiet of the evening, shadows start to fall
-Whispers of the night wind echo through the hall
-Lost within the silence, I hear your gentle voice
-Guiding me back homeward, making my heart rejoice
-[chorus]
-Don't let this moment fade, hold me close tonight
-With you here beside me, everything's alright
-Can't imagine life alone, don't want to let you go
-Stay with me forever, let our love just flow
-"""]
-        ],
-        inputs=[genre_txt, lyrics_txt],
-        outputs=audio_output
-    )
     submit_btn.click(
-        fn=infer,
-        inputs=[genre_txt, lyrics_txt, num_segments, max_tokens],
-        outputs=audio_output
     )
-demo.queue().launch()

 import gradio as gr
 import subprocess
+import os
 import shutil
 import tempfile
 import spaces
+import torch
+import os
 import sys
 print("Installing flash-attn...")
 # Install flash attention
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
+import argparse
 import numpy as np
+import json
 from omegaconf import OmegaConf
 import torchaudio
+from torchaudio.transforms import Resample
 import soundfile as sf
+import uuid
+from tqdm import tqdm
+from einops import rearrange
 from codecmanipulator import CodecManipulator
+from mmtokenizer import _MMSentencePieceTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
+import glob
+import time
+import copy
+from collections import Counter
+from models.soundstream_hubert_new import SoundStream
+from vocoder import build_codec_model, process_audio
+from post_process_audio import replace_low_freq_with_energy_matched
+import re
# True when this app runs inside the shared "innova-ai/YuE-music-generator-demo"
# Space. SPACE_ID may be unset (for example when the script is started outside
# a Hugging Face Space), so fall back to an empty string instead of raising
# KeyError at import time.
is_shared_ui = "innova-ai/YuE-music-generator-demo" in os.environ.get("SPACE_ID", "")
def empty_output_folder(output_dir):
    """Delete every entry inside ``output_dir`` while keeping the folder itself.

    Removal is best-effort: a failure on one entry is reported on stdout and
    the remaining entries are still processed.

    Args:
        output_dir: Path of the directory to empty. It must already exist;
            ``os.listdir`` raises ``OSError`` otherwise.
    """
    for name in os.listdir(output_dir):
        file_path = os.path.join(output_dir, name)
        try:
            # A symlink that points at a directory satisfies isdir(), but
            # shutil.rmtree refuses to operate on symlinks. Unlink it instead
            # so the link goes away and its target is left untouched.
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                # Real directory: remove it recursively
                shutil.rmtree(file_path)
            else:
                # Regular file or symlink: delete the entry itself
                os.remove(file_path)
        except OSError as e:
            print(f"Error deleting file {file_path}: {e}")
# Function to create a temporary file with string content
def create_temp_file(content, prefix, suffix=".txt"):
    """Write ``content`` to a new temporary text file and return its path.

    The text is stripped, two trailing newlines are appended, and CRLF / CR
    line endings are normalized to LF before writing. The file is created with
    ``delete=False``, so the caller is responsible for removing it.

    Args:
        content: Text to store.
        prefix: Filename prefix for the temporary file.
        suffix: Filename suffix (extension) for the temporary file.

    Returns:
        Path of the file that was written.
    """
    # Ensure content ends with newline and normalize line endings
    content = content.strip() + "\n\n"  # Add extra newline at end
    content = content.replace("\r\n", "\n").replace("\r", "\n")
    # Explicit UTF-8 keeps non-ASCII text independent of the platform's default
    # encoding; the context manager closes the handle even if write() raises.
    with tempfile.NamedTemporaryFile(
        delete=False, mode="w", prefix=prefix, suffix=suffix, encoding="utf-8"
    ) as temp_file:
        temp_file.write(content)
    # Debug: Print file contents
    print(f"\nContent written to {prefix}{suffix}:")
    print(content)
    print("---")
    return temp_file.name
def get_last_mp3_file(output_dir):
    """Return the most recently modified ``.mp3`` file directly inside ``output_dir``.

    Only regular files whose name ends with ``.mp3`` are considered; a
    sub-directory that happens to carry that suffix is ignored.

    Args:
        output_dir: Directory to scan (not recursive). ``os.listdir`` raises
            ``OSError`` if it cannot be listed.

    Returns:
        Full path of the newest ``.mp3`` file, or ``None`` (after printing a
        message) when there is none.
    """
    candidates = [
        os.path.join(output_dir, name)
        for name in os.listdir(output_dir)
        if name.endswith('.mp3')
    ]
    mp3_files_with_path = [path for path in candidates if os.path.isfile(path)]
    if not mp3_files_with_path:
        print("No .mp3 files found in the output folder.")
        return None
    # max() picks the newest entry directly; no need to sort the whole list.
    return max(mp3_files_with_path, key=os.path.getmtime)
# Stage-1 language model: loaded once at import time, placed on `device` and
# switched to eval mode so every request can reuse the same instance.
device = "cuda:0"
model = AutoModelForCausalLM.from_pretrained(
    "m-a-p/YuE-s1-7B-anneal-en-cot",
    attn_implementation="flash_attention_2",  # requires the flash-attn package to be installed
    torch_dtype=torch.float16,
).to(device)
model.eval()
def generate_music(
    stage1_model="m-a-p/YuE-s1-7B-anneal-en-cot",
    max_new_tokens=3000,
    run_n_segments=2,
    genre_txt=None,
    lyrics_txt=None,
    use_audio_prompt=False,
    audio_prompt_path="",
    prompt_start_time=0.0,
    prompt_end_time=30.0,
    output_dir="./output",
    keep_intermediate=False,
    disable_offload_model=False,
    cuda_idx=0,
    basic_model_config='./xcodec_mini_infer/final_ckpt/config.yaml',
    resume_path='./xcodec_mini_infer/final_ckpt/ckpt_00360000.pth',
    config_path='./xcodec_mini_infer/decoders/config.yaml',
    vocal_decoder_path='./xcodec_mini_infer/decoders/decoder_131000.pth',
    inst_decoder_path='./xcodec_mini_infer/decoders/decoder_151000.pth',
    rescale=False,
):
    """Generate a song from a genre description and tagged lyrics.

    Pipeline: build one prompt per lyric section, sample codec tokens section
    by section with the stage-1 language model, split the tokens into vocal and
    instrumental streams, decode both with the codec model, mix them, run the
    vocoder decoders and finally call ``replace_low_freq_with_energy_matched``.

    Args:
        stage1_model: Either an already loaded causal LM (used as is) or a
            checkpoint name/path that is loaded here with ``from_pretrained``.
        max_new_tokens: Upper bound on sampled tokens per lyric section.
        run_n_segments: Number of lyric sections to generate; capped at the
            number of sections found in ``lyrics_txt``.
        genre_txt: Genre/tag description inserted into the instruction prompt.
        lyrics_txt: Lyrics made of sections introduced by tags such as
            ``[verse]`` or ``[chorus]``.
        use_audio_prompt: When true, an excerpt of ``audio_prompt_path`` is
            encoded and prepended as a reference.
        audio_prompt_path: Audio file used when ``use_audio_prompt`` is set.
        prompt_start_time: Start of the reference excerpt, in seconds.
        prompt_end_time: End of the reference excerpt, in seconds.
        output_dir: Root folder for all intermediate and final files.
        keep_intermediate: Currently not used by this function.
        disable_offload_model: When false, a model that was loaded by this
            function is moved off the GPU before audio decoding.
        cuda_idx: Forwarded to ``process_audio`` through the namespace built
            from the local variables.
        basic_model_config: OmegaConf YAML describing the codec model.
        resume_path: Checkpoint holding the ``'codec_model'`` state dict.
        config_path: Config passed to ``build_codec_model``.
        vocal_decoder_path: Vocal decoder weights for ``build_codec_model``.
        inst_decoder_path: Instrumental decoder weights for ``build_codec_model``.
        rescale: Rescale instead of clamp when saving audio.

    Returns:
        Path of the reconstructed mix written under ``<output_dir>/recons/mix``.

    Raises:
        FileNotFoundError: ``use_audio_prompt`` is set without a prompt path.
        ValueError: Missing genre/lyrics, no tagged lyric section, a
            non-positive ``run_n_segments`` or unbalanced start/end-of-audio
            tokens in the generated sequence.
        RuntimeError: No instrumental/vocal pair could be mixed.
    """
    if use_audio_prompt and not audio_prompt_path:
        raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
    if genre_txt is None or lyrics_txt is None:
        raise ValueError("Both 'genre_txt' and 'lyrics_txt' are required.")
    # The default value of `stage1_model` is a checkpoint name, which cannot be
    # used for generation directly; load it here. A model object handed in by
    # the caller is used as is and stays owned by the caller.
    owns_model = isinstance(stage1_model, str)
    if owns_model:
        model = AutoModelForCausalLM.from_pretrained(
            stage1_model,
            torch_dtype=torch.float16,
            attn_implementation="flash_attention_2",
        )
        model.to(device)
        model.eval()
    else:
        model = stage1_model
    stage1_output_dir = os.path.join(output_dir, "stage1")
    os.makedirs(stage1_output_dir, exist_ok=True)
    mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
    codectool = CodecManipulator("xcodec", 0, 1)
    model_config = OmegaConf.load(basic_model_config)
    # NOTE(review): eval() resolves the generator class from a name stored in
    # the config file; only point `basic_model_config` at trusted files.
    codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
    parameter_dict = torch.load(resume_path, map_location='cpu')
    codec_model.load_state_dict(parameter_dict['codec_model'])
    codec_model.to(device)
    codec_model.eval()

    class BlockTokenRangeProcessor(LogitsProcessor):
        """Logits processor that forbids sampling token ids in [start_id, end_id)."""

        def __init__(self, start_id, end_id):
            self.blocked_token_ids = list(range(start_id, end_id))

        def __call__(self, input_ids, scores):
            scores[:, self.blocked_token_ids] = -float("inf")
            return scores

    def load_audio_mono(filepath, sampling_rate=16000):
        """Load an audio file, average its channels and resample if required."""
        audio, sr = torchaudio.load(filepath)
        # Convert to mono
        audio = torch.mean(audio, dim=0, keepdim=True)
        # Resample if needed
        if sr != sampling_rate:
            resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
            audio = resampler(audio)
        return audio

    def split_lyrics(lyrics: str):
        """Split tagged lyrics into "[tag]\\n<text>\\n\\n" sections."""
        pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
        segments = re.findall(pattern, lyrics, re.DOTALL)
        structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
        return structured_lyrics

    stage1_output_set = []
    genres = genre_txt.strip()
    lyrics = split_lyrics(lyrics_txt+"\n")
    if not lyrics:
        # Without at least one tagged section nothing would be generated and the
        # code below would fail on an empty result.
        raise ValueError("No lyric sections found: every section must start with a tag such as '[verse]'.")
    # instruction
    full_lyrics = "\n".join(lyrics)
    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
    prompt_texts += lyrics
    random_id = uuid.uuid4()
    output_seq = None
    # Here is suggested decoding config
    top_p = 0.93
    temperature = 1.0
    repetition_penalty = 1.2
    # special tokens
    start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
    end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
    raw_output = None
    # prompt_texts[0] is the instruction and is skipped by the loop, so running
    # N sections needs the first N + 1 entries. The cap is therefore the length
    # of prompt_texts (number of sections + 1); capping at len(lyrics) would
    # always drop the last section.
    run_n_segments = min(int(run_n_segments) + 1, len(prompt_texts))
    if run_n_segments < 2:
        raise ValueError("'run_n_segments' must be at least 1.")
    # Format text prompt
    for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
        section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
        guidance_scale = 1.5 if i <=1 else 1.2
        if i==0:
            continue
        if i==1:
            if use_audio_prompt:
                audio_prompt = load_audio_mono(audio_prompt_path)
                audio_prompt.unsqueeze_(0)
                with torch.no_grad():
                    raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
                raw_codes = raw_codes.transpose(0, 1)
                raw_codes = raw_codes.cpu().numpy().astype(np.int16)
                # Format audio prompt
                code_ids = codectool.npy2ids(raw_codes[0])
                audio_prompt_codec = code_ids[int(prompt_start_time *50): int(prompt_end_time *50)] # 50 is tps of xcodec
                audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
                sentence_ids = mmtokenizer.tokenize("[start_of_reference]") +  audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
                head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
            else:
                head_id = mmtokenizer.tokenize(prompt_texts[0])
            prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
        else:
            prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
        prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
        input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
        # Use window slicing in case output sequence exceeds the context of model
        max_context = 16384-max_new_tokens-1
        if input_ids.shape[-1] > max_context:
            print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
            input_ids = input_ids[:, -(max_context):]
        with torch.no_grad():
            output_seq = model.generate(
                input_ids=input_ids,
                max_new_tokens=max_new_tokens,
                min_new_tokens=100,
                do_sample=True,
                top_p=top_p,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                eos_token_id=mmtokenizer.eoa,
                pad_token_id=mmtokenizer.eoa,
                # NOTE(review): range(32016, 32016) is empty, so the second
                # processor blocks nothing. Confirm whether token 32016 was
                # meant to be blocked before changing it.
                logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
                guidance_scale=guidance_scale,
                )
            # Make sure every section is closed by an end-of-audio token
            if output_seq[0][-1].item() != mmtokenizer.eoa:
                tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
                output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
        if i > 1:
            # Append only the newly sampled part; input_ids may have been windowed
            raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
        else:
            raw_output = output_seq
        print(len(raw_output))
    # save raw output and check sanity
    ids = raw_output[0].cpu().numpy()
    soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
    eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
    if len(soa_idx)!=len(eoa_idx):
        raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
    vocals = []
    instrumentals = []
    # With an audio prompt the first soa/eoa pair encloses the reference audio
    range_begin = 1 if use_audio_prompt else 0
    for i in range(range_begin, len(soa_idx)):
        codec_ids = ids[soa_idx[i]+1:eoa_idx[i]]
        if codec_ids[0] == 32016:
            codec_ids = codec_ids[1:]
        # Keep an even number of tokens: they alternate vocal / instrumental
        codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
        vocals_ids = codectool.ids2npy(rearrange(codec_ids,"(n b) -> b n", b=2)[0])
        vocals.append(vocals_ids)
        instrumentals_ids = codectool.ids2npy(rearrange(codec_ids,"(n b) -> b n", b=2)[1])
        instrumentals.append(instrumentals_ids)
    vocals = np.concatenate(vocals, axis=1)
    instrumentals = np.concatenate(instrumentals, axis=1)
    vocal_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_vocal_{random_id}".replace('.', '@')+'.npy')
    inst_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_instrumental_{random_id}".replace('.', '@')+'.npy')
    np.save(vocal_save_path, vocals)
    np.save(inst_save_path, instrumentals)
    stage1_output_set.append(vocal_save_path)
    stage1_output_set.append(inst_save_path)
    # offload model
    # Only a model loaded by this function is offloaded. Moving a model that
    # the caller still holds to the CPU would leave it on the wrong device for
    # the caller's next request.
    if not disable_offload_model and owns_model:
        model.cpu()
        del model
        torch.cuda.empty_cache()
    print("Converting to Audio...")
    # convert audio tokens to audio
    def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
        """Save ``wav`` as 16-bit PCM, rescaling or clamping it to +/-0.99."""
        folder_path = os.path.dirname(path)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        limit = 0.99
        max_val = wav.abs().max()
        wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
        torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
    # reconstruct tracks
    recons_output_dir = os.path.join(output_dir, "recons")
    recons_mix_dir = os.path.join(recons_output_dir, 'mix')
    os.makedirs(recons_mix_dir, exist_ok=True)
    tracks = []
    for npy in stage1_output_set:
        codec_result = np.load(npy)
        with torch.no_grad():
            decoded_waveform = codec_model.decode(torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
        decodec_rlt = torch.as_tensor(decoded_waveform.cpu().squeeze(0))
        save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
        tracks.append(save_path)
        save_audio(decodec_rlt, save_path, 16000)
    # mix tracks
    recons_mix = None
    for inst_path in tracks:
        try:
            if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
                and 'instrumental' in inst_path:
                # find pair
                vocal_path = inst_path.replace('instrumental', 'vocal')
                if not os.path.exists(vocal_path):
                    continue
                # mix
                recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
                instrumental_stem, sr = sf.read(inst_path)
                vocal_stem, _ = sf.read(vocal_path)
                mix_stem = vocal_stem + instrumental_stem
                sf.write(recons_mix, mix_stem, sr)
        except Exception as e:
            print(e)
    if recons_mix is None:
        # Every later step derives its file names from the mixed track
        raise RuntimeError("No instrumental/vocal track pair was found to mix.")
    # vocoder to upsample audios
    vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
    vocoder_output_dir = os.path.join(output_dir, 'vocoder')
    vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
    vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
    os.makedirs(vocoder_mix_dir, exist_ok=True)
    os.makedirs(vocoder_stems_dir, exist_ok=True)
    instrumental_output = None
    vocal_output = None
    for npy in stage1_output_set:
        if 'instrumental' in npy:
            # Process instrumental
            instrumental_output = process_audio(
                npy,
                os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
                rescale,
                argparse.Namespace(**locals()), # Convert local variables to argparse.Namespace
                inst_decoder,
                codec_model
            )
        else:
            # Process vocal
            vocal_output = process_audio(
                npy,
                os.path.join(vocoder_stems_dir, 'vocal.mp3'),
                rescale,
                argparse.Namespace(**locals()), # Convert local variables to argparse.Namespace
                vocal_decoder,
                codec_model
            )
    # mix tracks
    # The target path is computed before the try block so that the error
    # message and the post-processing step below can always refer to it.
    vocoder_mix = os.path.join(vocoder_mix_dir, os.path.basename(recons_mix))
    try:
        mix_output = instrumental_output + vocal_output
        save_audio(mix_output, vocoder_mix, 44100, rescale)
        print(f"Created mix: {vocoder_mix}")
    except RuntimeError as e:
        print(e)
        print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
    # Post process
    replace_low_freq_with_energy_matched(
        a_file=recons_mix,     # codec reconstruction mix (stems saved at 16 kHz above)
        b_file=vocoder_mix,     # vocoder mix (saved at 44.1 kHz above)
        c_file=os.path.join(output_dir, os.path.basename(recons_mix)),
        cutoff_freq=5500.0
    )
    print("All process Done")
    # NOTE(review): this returns the reconstruction mix, not the post-processed
    # file written to `c_file` above - confirm which one callers should get.
    return recons_mix
 @spaces.GPU(duration=120)
+def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200):
+    # Ensure the output folder exists
+    output_dir = "./output"
+    os.makedirs(output_dir, exist_ok=True)
+    print(f"Output folder ensured at: {output_dir}")
+    empty_output_folder(output_dir)
+    # Execute the command
+    try:
+        music = generate_music(stage1_model=model, genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments, output_dir=output_dir, cuda_idx=0, max_new_tokens=max_new_tokens)
+        return music
+    except subprocess.CalledProcessError as e:
+        print(f"Error occurred: {e}")
+        return None
+    finally:
+        # Clean up temporary files
+        print("Temporary files deleted.")
# Gradio
# UI layout: genre and lyrics inputs on the left, generation settings, submit
# button and audio result on the right, followed by two clickable examples.
with gr.Blocks() as demo:
    with gr.Column():
        gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
        gr.HTML("""
        <div style="display:flex;column-gap:4px;">
            <a href="https://github.com/multimodal-art-projection/YuE">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://map-yue.github.io">
                <img src='https://img.shields.io/badge/Project-Page-green'>
            </a>
            <a href="https://huggingface.co/spaces/innova-ai/YuE-music-generator-demo?duplicate=true">
                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
            </a>
        </div>
        """)
        with gr.Row():
            with gr.Column():
                genre_txt = gr.Textbox(label="Genre")
                lyrics_txt = gr.Textbox(label="Lyrics")
            with gr.Column():
                # The shared demo exposes a smaller token budget than a private
                # copy. Slider bounds are passed as numbers, not strings.
                if is_shared_ui:
                    num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
                    max_new_tokens = gr.Slider(label="Max New Tokens", info="100 tokens equals 1 second long music", minimum=100, maximum=3000, step=100, value=500, interactive=True) # increase it after testing
                else:
                    num_segments = gr.Number(label="Number of Song Segments", value=2, interactive=True)
                    max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum=24000, step=500, value=3000, interactive=True)
                submit_btn = gr.Button("Submit")
                music_out = gr.Audio(label="Audio Result")
        gr.Examples(
            examples = [
                [
                    "female blues airy vocal bright vocal piano sad romantic guitar jazz",
                    """[verse]
In the quiet of the evening, shadows start to fall
Whispers of the night wind echo through the hall
Lost within the silence, I hear your gentle voice
Guiding me back homeward, making my heart rejoice
[chorus]
Don't let this moment fade, hold me close tonight
With you here beside me, everything's alright
Can't imagine life alone, don't want to let you go
Stay with me forever, let our love just flow
                    """
                ],
                [
                    "rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
                    """[verse]
Woke up in the morning, sun is shining bright
Chasing all my dreams, gotta get my mind right
City lights are fading, but my vision's clear
This is my life, and I'm aiming for the top
Never gonna quit, no, I'm never gonna stop
Through the highs and lows, I'mma keep it real
Living out my dreams with this mic and a deal
                    """
                ]
            ],
            inputs = [genre_txt, lyrics_txt],
            outputs = [music_out],
            # Examples only fill the inputs; nothing is generated ahead of time
            cache_examples = False,
            # cache_mode="lazy",
            fn=infer
        )
    submit_btn.click(
        fn = infer,
        inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
        outputs = [music_out]
    )
demo.queue().launch(show_api=False, show_error=True)