KingNish committed on
Commit 9df60ba · verified · 1 Parent(s): b1860c5

Update app.py

Files changed (1)
  1. app.py +289 -317

app.py CHANGED
@@ -9,69 +9,66 @@ import torch
  from huggingface_hub import snapshot_download
  import uuid
  import time
- import copy
- from collections import Counter
- import re
- import numpy as np
  import torchaudio
- import soundfile as sf
  from torchaudio.transforms import Resample
- from einops import rearrange
- from tqdm import tqdm
  from omegaconf import OmegaConf
- import spaces

- # --- Constants and Environment Setup ---
  IS_SHARED_UI = "innova-ai/YuE-music-generator-demo" in os.environ.get('SPACE_ID', '')
  OUTPUT_DIR = "./output"
- XCODEC_FOLDER = "./xcodec_mini_infer"
- MM_TOKENIZER_PATH = "./mm_tokenizer_v0.2_hf/tokenizer.model"
- STAGE1_MODEL_NAME = "m-a-p/YuE-s1-7B-anneal-en-cot"

- # --- Utility Functions ---
  def install_flash_attn():
-     """Installs flash-attn using pip."""
      try:
          print("Installing flash-attn...")
          subprocess.run(
              "pip install flash-attn --no-build-isolation",
              env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
              shell=True,
-             check=True  # Raise an exception if the command fails
          )
          print("flash-attn installed successfully!")
      except subprocess.CalledProcessError as e:
          print(f"Failed to install flash-attn: {e}")
          exit(1)

- def download_xcodec_model(folder_path):
-     """Downloads xcodec model from huggingface hub."""
-     if not os.path.exists(folder_path):
-         os.makedirs(folder_path, exist_ok=True)
-         print(f"Folder created at: {folder_path}")
-     else:
-         print(f"Folder already exists at: {folder_path}")
- 
-     snapshot_download(
-         repo_id="m-a-p/xcodec_mini_infer",
-         local_dir=folder_path
-     )
-     print(f"Downloaded xcodec model to {folder_path}")
- 
- 
- def change_working_directory(directory):
-     """Changes the working directory."""
-     try:
-         os.chdir(directory)
-         print(f"Changed working directory to: {os.getcwd()}")
-     except FileNotFoundError:
-         print(f"Directory not found: {directory}")
-         exit(1)

  def empty_output_folder(output_dir):
-     """Clears the output directory."""
-     if not os.path.exists(output_dir):
-         return
      for file in os.listdir(output_dir):
          file_path = os.path.join(output_dir, file)
          try:
@@ -82,304 +79,288 @@ def empty_output_folder(output_dir):
      except Exception as e:
          print(f"Error deleting file {file_path}: {e}")

  def create_temp_file(content, prefix, suffix=".txt"):
-     """Creates a temporary file with given content."""
      content = content.strip() + "\n\n"
      content = content.replace("\r\n", "\n").replace("\r", "\n")
-     with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=prefix, suffix=suffix) as temp_file:
-         temp_file.write(content)
-         temp_file_name = temp_file.name
-     print(f"\nContent written to {prefix}{suffix}:")
-     print(content)
-     print("---")
-     return temp_file_name

  def get_last_mp3_file(output_dir):
-     """Returns the path to the most recently modified .mp3 file in the directory, or None if none exists."""
-     mp3_files = [os.path.join(output_dir, file) for file in os.listdir(output_dir) if file.endswith('.mp3')]
      if not mp3_files:
          print("No .mp3 files found in the output folder.")
          return None
-     return max(mp3_files, key=os.path.getmtime)

  def load_audio_mono(filepath, sampling_rate=16000):
-     """Loads an audio file and converts it to mono at the desired sample rate."""
      audio, sr = torchaudio.load(filepath)
-     audio = torch.mean(audio, dim=0, keepdim=True)  # Convert to mono
      if sr != sampling_rate:
          resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
          audio = resampler(audio)
      return audio

  def split_lyrics(lyrics: str):
-     """Splits lyrics into segments based on the [section] tags."""
      pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
      segments = re.findall(pattern, lyrics, re.DOTALL)
-     return [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]

  def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
-     """Saves a torch audio tensor to a file."""
-     os.makedirs(os.path.dirname(path), exist_ok=True)
      limit = 0.99
      max_val = wav.abs().max()
      wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
      torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)

- # --- Model Initialization ---
- def initialize_models(device):
-     """Initializes and loads all required models."""
-     print(f"Using device: {device}")
-     # Load Stage 1 Model
-     stage1_model = AutoModelForCausalLM.from_pretrained(
-         STAGE1_MODEL_NAME,
-         torch_dtype=torch.float16,
-         attn_implementation="flash_attention_2",
-     ).to(device).eval()
- 
-     # Load Tokenizer
-     mmtokenizer = _MMSentencePieceTokenizer(MM_TOKENIZER_PATH)
- 
-     # Load Codec Model
-     sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
-     sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
-     from codecmanipulator import CodecManipulator
-     from models.soundstream_hubert_new import SoundStream
- 
-     codectool = CodecManipulator("xcodec", 0, 1)
-     basic_model_config = os.path.join(XCODEC_FOLDER, "final_ckpt", "config.yaml")
-     resume_path = os.path.join(XCODEC_FOLDER, "final_ckpt", "ckpt_00360000.pth")
-     model_config = OmegaConf.load(basic_model_config)
-     codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
-     parameter_dict = torch.load(resume_path, map_location='cpu')
-     codec_model.load_state_dict(parameter_dict['codec_model'])
-     codec_model.to(device).eval()
- 
-     return stage1_model, mmtokenizer, codectool, codec_model
- 
- # --- Logits Processor ---
- class BlockTokenRangeProcessor(LogitsProcessor):
-     def __init__(self, start_id, end_id):
-         self.blocked_token_ids = list(range(start_id, end_id))
- 
-     def __call__(self, input_ids, scores):
-         scores[:, self.blocked_token_ids] = -float("inf")
-         return scores

- # --- Music Generation Core Function ---
- @spaces.GPU(duration=120)
- def generate_music(
-     stage1_model,
-     mmtokenizer,
-     codectool,
-     codec_model,
-     max_new_tokens=3000,
-     run_n_segments=2,
-     genre_txt=None,
-     lyrics_txt=None,
-     use_audio_prompt=False,
-     audio_prompt_path="",
-     prompt_start_time=0.0,
-     prompt_end_time=30.0,
-     output_dir=OUTPUT_DIR,
-     keep_intermediate=False,
-     disable_offload_model=False,
-     cuda_idx=0,
-     rescale=False,
- ):
-     if use_audio_prompt and not audio_prompt_path:
-         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
- 
-     stage1_output_dir = os.path.join(output_dir, f"stage1")
-     os.makedirs(stage1_output_dir, exist_ok=True)
- 
-     device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
-     print(f"Using device: {device}")
- 
-     # Load Model Parameters for decoding
-     class BlockTokenRangeProcessor(LogitsProcessor):
-         def __init__(self, start_id, end_id):
-             self.blocked_token_ids = list(range(start_id, end_id))
- 
-         def __call__(self, input_ids, scores):
-             scores[:, self.blocked_token_ids] = -float("inf")
-             return scores
- 
-     # Split lyrics
-     genres = genre_txt.strip()
-     lyrics = split_lyrics(lyrics_txt + "\n")
-     full_lyrics = "\n".join(lyrics)
-     prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
-     prompt_texts += lyrics
-     random_id = uuid.uuid4()
-     output_seq = None
-     top_p = 0.93
-     temperature = 1.0
-     repetition_penalty = 1.2
-     start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
-     end_of_segment = mmtokenizer.tokenize('[end_of_segment]')
-     raw_output = None
-     run_n_segments = min(run_n_segments + 1, len(lyrics))
-     stage1_output_set = []
- 
-     print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
-     for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
-         section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-         guidance_scale = 1.5 if i <= 1 else 1.2
-         if i == 0:
-             continue
-         if i == 1:
-             if use_audio_prompt:
-                 audio_prompt = load_audio_mono(audio_prompt_path)
-                 audio_prompt.unsqueeze_(0)
-                 with torch.no_grad():
-                     raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
-                 raw_codes = raw_codes.transpose(0, 1)
-                 raw_codes = raw_codes.cpu().numpy().astype(np.int16)
-                 # Format audio prompt
-                 code_ids = codectool.npy2ids(raw_codes[0])
-                 audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]  # 50 is tps of xcodec
-                 audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
-                 sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
-                 head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
              else:
-                 head_id = mmtokenizer.tokenize(prompt_texts[0])
-             prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-         else:
-             prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
- 
-         prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
-         input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
-         # Use window slicing in case output sequence exceeds the context of model
-         max_context = 16384 - max_new_tokens - 1
-         if input_ids.shape[-1] > max_context:
-             print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
-             input_ids = input_ids[:, -(max_context):]
-         with torch.no_grad():
-             output_seq = stage1_model.generate(
-                 input_ids=input_ids,
-                 max_new_tokens=max_new_tokens,
-                 min_new_tokens=100,
-                 do_sample=True,
-                 top_p=top_p,
-                 temperature=temperature,
-                 repetition_penalty=repetition_penalty,
-                 eos_token_id=mmtokenizer.eoa,
-                 pad_token_id=mmtokenizer.eoa,
-                 logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
-                 guidance_scale=guidance_scale,
              )
-         if output_seq[0][-1].item() != mmtokenizer.eoa:
-             tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(stage1_model.device)
-             output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
-         if i > 1:
-             raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
-         else:
-             raw_output = output_seq
-         print(len(raw_output))
- 
-     # save raw output and check sanity
-     ids = raw_output[0].cpu().numpy()
-     soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
-     eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
-     if len(soa_idx) != len(eoa_idx):
-         raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
- 
-     vocals = []
-     instrumentals = []
-     range_begin = 1 if use_audio_prompt else 0
-     for i in range(range_begin, len(soa_idx)):
-         codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
-         if codec_ids[0] == 32016:
-             codec_ids = codec_ids[1:]
-         codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
-         vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
-         vocals.append(vocals_ids)
-         instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
-         instrumentals.append(instrumentals_ids)
-     vocals = np.concatenate(vocals, axis=1)
-     instrumentals = np.concatenate(instrumentals, axis=1)
-     vocal_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_vocal_{random_id}".replace('.', '@') + '.npy')
-     inst_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_instrumental_{random_id}".replace('.', '@') + '.npy')
-     np.save(vocal_save_path, vocals)
-     np.save(inst_save_path, instrumentals)
-     stage1_output_set.append(vocal_save_path)
-     stage1_output_set.append(inst_save_path)
- 
-     # offload model
-     if not disable_offload_model:
-         stage1_model.cpu()
-         del stage1_model
-         torch.cuda.empty_cache()
- 
-     print("Converting to Audio...")
-     # convert audio tokens to audio
- 
-     # reconstruct tracks
-     recons_output_dir = os.path.join(output_dir, "recons")
-     recons_mix_dir = os.path.join(recons_output_dir, 'mix')
-     os.makedirs(recons_mix_dir, exist_ok=True)
-     tracks = []
-     for npy in stage1_output_set:
-         codec_result = np.load(npy)
-         decodec_rlt = []
-         with torch.no_grad():
-             decoded_waveform = codec_model.decode(torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
-         decoded_waveform = decoded_waveform.cpu().squeeze(0)
-         decodec_rlt.append(torch.as_tensor(decoded_waveform))
-         decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-         save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
-         tracks.append(save_path)
-         save_audio(decodec_rlt, save_path, 16000)
-     # mix tracks
-     for inst_path in tracks:
-         try:
-             if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
-                     and 'instrumental' in inst_path:
-                 # find pair
-                 vocal_path = inst_path.replace('instrumental', 'vocal')
-                 if not os.path.exists(vocal_path):
-                     continue
-                 # mix
-                 recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
-                 vocal_stem, sr = sf.read(inst_path)
-                 instrumental_stem, _ = sf.read(vocal_path)
-                 mix_stem = (vocal_stem + instrumental_stem) / 1
-                 sf.write(recons_mix, mix_stem, sr)
-         except Exception as e:
-             print(e)
-     return recons_mix

  # --- Gradio Interface ---
  @spaces.GPU(duration=120)
  def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200):
-     """Main function that runs model and returns output audio."""
      os.makedirs(OUTPUT_DIR, exist_ok=True)
      print(f"Output folder ensured at: {OUTPUT_DIR}")
      empty_output_folder(OUTPUT_DIR)
- 
-     device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
-     stage1_model, mmtokenizer, codectool, codec_model = initialize_models(device)
- 
      try:
-         music = generate_music(
-             stage1_model=stage1_model,
-             mmtokenizer=mmtokenizer,
-             codectool=codectool,
-             codec_model=codec_model,
-             genre_txt=genre_txt_content,
-             lyrics_txt=lyrics_txt_content,
-             run_n_segments=num_segments,
-             output_dir=OUTPUT_DIR,
-             cuda_idx=0,
              max_new_tokens=max_new_tokens
          )
-         return music
-     except subprocess.CalledProcessError as e:
-         print(f"Error occurred: {e}")
          return None
      finally:
          print("Temporary files deleted.")

  with gr.Blocks() as demo:
      with gr.Column():
          gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
@@ -387,7 +368,7 @@ with gr.Blocks() as demo:
          <div style="display:flex;column-gap:4px;">
              <a href="https://github.com/multimodal-art-projection/YuE">
                  <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-             </a>
              <a href="https://map-yue.github.io">
                  <img src='https://img.shields.io/badge/Project-Page-green'>
              </a>
@@ -400,11 +381,11 @@ with gr.Blocks() as demo:
      with gr.Column():
          genre_txt = gr.Textbox(label="Genre")
          lyrics_txt = gr.Textbox(label="Lyrics")
- 
      with gr.Column():
          if IS_SHARED_UI:
              num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
-             max_new_tokens = gr.Slider(label="Max New Tokens", info="100 tokens equals 1 second of music", minimum=100, maximum="3000", step=100, value=500, interactive=True)
          else:
              num_segments = gr.Number(label="Number of Song Segments", value=2, interactive=True)
              max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum=24000, step=500, value=3000, interactive=True)
@@ -412,7 +393,7 @@ with gr.Blocks() as demo:
      music_out = gr.Audio(label="Audio Result")

      gr.Examples(
-         examples = [
          [
              "female blues airy vocal bright vocal piano sad romantic guitar jazz",
              """[verse]
@@ -447,26 +428,17 @@ Through the highs and lows, I'mma keep it real
  Living out my dreams with this mic and a deal
  """
          ]
-         ],
-         inputs = [genre_txt, lyrics_txt],
-         outputs = [music_out],
-         cache_examples = False,
          fn=infer
      )
- 
      submit_btn.click(
-         fn = infer,
-         inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
-         outputs = [music_out]
      )
- 
- # --- Initialization and Execution ---
- if __name__ == "__main__":
-     # Install Flash Attention
-     install_flash_attn()
-     # Download xcodec mini infer
-     download_xcodec_model(XCODEC_FOLDER)
-     # Change to inference working directory
-     change_working_directory(".")
- 
-     demo.queue().launch(show_api=False, show_error=True)

  from huggingface_hub import snapshot_download
  import uuid
  import time
+ from tqdm import tqdm
+ from einops import rearrange
  import torchaudio
  from torchaudio.transforms import Resample
+ import soundfile as sf
  from omegaconf import OmegaConf
+ import numpy as np
+ import re
+ import sys
+ from collections import Counter

+ # --- Constants and Setup ---
  IS_SHARED_UI = "innova-ai/YuE-music-generator-demo" in os.environ.get('SPACE_ID', '')
  OUTPUT_DIR = "./output"
+ XCODEC_MINI_INFER_DIR = "./xcodec_mini_infer"
+ MODEL_ID = "m-a-p/YuE-s1-7B-anneal-en-cot"

+ # Install flash-attn
  def install_flash_attn():
      try:
          print("Installing flash-attn...")
+         # Install flash attention
          subprocess.run(
              "pip install flash-attn --no-build-isolation",
              env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
              shell=True,
+             check=True  # Use check=True to raise an exception on failure
          )
          print("flash-attn installed successfully!")
      except subprocess.CalledProcessError as e:
          print(f"Failed to install flash-attn: {e}")
          exit(1)

+ install_flash_attn()
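One thing to note about this helper: `subprocess.run(..., env={...})` replaces the entire environment rather than adding one variable, so `pip` can lose `PATH` and CUDA-related variables. A minimal alternative sketch (not part of this commit; names are illustrative) that merges the flag into a copy of the parent environment:

```python
# Sketch only: same install step, with the parent environment preserved.
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE makes flash-attn skip compiling CUDA
# kernels at install time (the prebuilt-wheel path used on Spaces).
import os
import subprocess
import sys

def install_flash_attn_sketch():
    env = os.environ.copy()                  # keep PATH, CUDA_HOME, etc.
    env["FLASH_ATTENTION_SKIP_CUDA_BUILD"] = "TRUE"
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
        env=env,
        check=True,                          # raise CalledProcessError on failure
    )
```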
 
 
 
 
+ # --- Utility Functions ---
+ def download_xcodec_resources():
+     """Downloads xcodec inference files."""
+     if not os.path.exists(XCODEC_MINI_INFER_DIR):
+         os.makedirs(XCODEC_MINI_INFER_DIR, exist_ok=True)
+         print(f"Created folder at: {XCODEC_MINI_INFER_DIR}")
+         snapshot_download(repo_id="m-a-p/xcodec_mini_infer", local_dir=XCODEC_MINI_INFER_DIR)
+     else:
+         print(f"Folder already exists at: {XCODEC_MINI_INFER_DIR}")
+ 
+ 
+ download_xcodec_resources()
+ # Add xcodec paths
+ sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
+ sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
+ from codecmanipulator import CodecManipulator
+ from mmtokenizer import _MMSentencePieceTokenizer
+ from models.soundstream_hubert_new import SoundStream
+ from vocoder import build_codec_model, process_audio
+ from post_process_audio import replace_low_freq_with_energy_matched
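`snapshot_download` is designed to be idempotent: it reuses files already on disk and resumes partial downloads, so the existence check above mainly saves a metadata round-trip, and an unconditional call would also be safe. A minimal sketch:

```python
# Sketch: fetch the xcodec inference assets unconditionally; files already
# present in local_dir are reused rather than re-downloaded.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="m-a-p/xcodec_mini_infer",   # codec config + checkpoints
    local_dir="./xcodec_mini_infer",     # materialized next to app.py
)
print(f"xcodec assets at: {local_dir}")
```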

  def empty_output_folder(output_dir):
+     """Empties the output folder."""
      for file in os.listdir(output_dir):
          file_path = os.path.join(output_dir, file)
          try:
@@ -82,304 +79,288 @@ def empty_output_folder(output_dir):
      except Exception as e:
          print(f"Error deleting file {file_path}: {e}")

+ 
  def create_temp_file(content, prefix, suffix=".txt"):
+     """Creates a temporary file with content."""
+     temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=prefix, suffix=suffix)
      content = content.strip() + "\n\n"
      content = content.replace("\r\n", "\n").replace("\r", "\n")
+     temp_file.write(content)
+     temp_file.close()
+     print(f"\nContent written to {prefix}{suffix}:\n{content}\n---")
+     return temp_file.name
+ 
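`create_temp_file` (and `get_last_mp3_file` below) survive from the earlier CLI-style pipeline and are no longer called on the Gradio path. A hypothetical usage sketch; since `delete=False`, removal is the caller's responsibility:

```python
# Hypothetical usage of create_temp_file (not invoked by the app itself).
import os

genre_file = create_temp_file("female blues airy vocal piano", prefix="genre_")
lyrics_file = create_temp_file("[verse]\nStaring at the sunset\n", prefix="lyrics_")
# ... hand the file paths to a subprocess-based pipeline ...
os.remove(genre_file)    # delete=False, so clean up explicitly
os.remove(lyrics_file)
```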
 
 
  def get_last_mp3_file(output_dir):
+     """Gets the most recently modified MP3 file in a directory."""
+     mp3_files = [file for file in os.listdir(output_dir) if file.endswith('.mp3')]
      if not mp3_files:
          print("No .mp3 files found in the output folder.")
          return None
+     mp3_files_with_path = [os.path.join(output_dir, file) for file in mp3_files]
+     mp3_files_with_path.sort(key=os.path.getmtime, reverse=True)
+     return mp3_files_with_path[0]
+ 
+ 
+ class BlockTokenRangeProcessor(LogitsProcessor):
+     def __init__(self, start_id, end_id):
+         self.blocked_token_ids = list(range(start_id, end_id))
+ 
+     def __call__(self, input_ids, scores):
+         scores[:, self.blocked_token_ids] = -float("inf")
+         return scores
+ 
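`BlockTokenRangeProcessor` masks a half-open id range `[start_id, end_id)` to -inf before sampling; in the generation call below it blocks the text-token ids so that only audio codec tokens can be emitted (note that `BlockTokenRangeProcessor(32016, 32016)` spans an empty range and is a no-op as written). A quick self-contained check of the masking behavior, using the class defined above:

```python
# Illustrative check: ids 0-4 of a 10-token vocabulary become unsamplable.
import torch
from transformers import LogitsProcessorList

processors = LogitsProcessorList([BlockTokenRangeProcessor(0, 5)])
scores = torch.zeros(1, 10)                       # batch 1, vocab 10
masked = processors(torch.tensor([[7]]), scores)  # input_ids unused here
print(masked[0, :5])  # tensor([-inf, -inf, -inf, -inf, -inf])
print(masked[0, 5:])  # tensor([0., 0., 0., 0., 0.])
```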

  def load_audio_mono(filepath, sampling_rate=16000):
+     """Loads an audio file and converts to mono, optionally resamples."""
      audio, sr = torchaudio.load(filepath)
+     audio = torch.mean(audio, dim=0, keepdim=True)
      if sr != sampling_rate:
          resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
          audio = resampler(audio)
      return audio
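A small round-trip sketch of `load_audio_mono` (assumes a working torchaudio backend; the file name is illustrative): write a one-second stereo 44.1 kHz tone, then load it back as mono 16 kHz:

```python
import torch
import torchaudio

t = torch.linspace(0, 1, 44100)
tone = torch.sin(2 * torch.pi * 440 * t)
torchaudio.save("tone.wav", torch.stack([tone, tone]), 44100)  # stereo, 44.1 kHz

mono = load_audio_mono("tone.wav", sampling_rate=16000)
print(mono.shape)  # roughly torch.Size([1, 16000]): one channel, one second at 16 kHz
```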

+ 
  def split_lyrics(lyrics: str):
+     """Splits lyrics into segments based on bracketed headers."""
      pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
      segments = re.findall(pattern, lyrics, re.DOTALL)
+     structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
+     return structured_lyrics
+ 
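The regex captures each `[header]` and everything up to the next `[` or the end of the string; a runnable illustration:

```python
# Each bracketed header starts a segment; DOTALL lets '.' span newlines.
lyrics = "[verse]\nStaring at the sunset\nColors paint the sky\n[chorus]\nHold me close tonight\n"
for segment in split_lyrics(lyrics):
    print(repr(segment))
# '[verse]\nStaring at the sunset\nColors paint the sky\n\n'
# '[chorus]\nHold me close tonight\n\n'
```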

  def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
+     """Saves an audio tensor to disk."""
+     folder_path = os.path.dirname(path)
+     if not os.path.exists(folder_path):
+         os.makedirs(folder_path)
      limit = 0.99
      max_val = wav.abs().max()
      wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
      torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
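The two clipping strategies in `save_audio` behave quite differently near full scale: `rescale=True` scales the whole waveform so its peak sits at 0.99, while the default hard-clamps individual samples. A side-by-side sketch:

```python
import torch

wav = torch.tensor([[0.5, -1.5, 2.0]])
limit = 0.99
rescaled = wav * min(limit / wav.abs().max(), 1)  # shape preserved: [0.2475, -0.7425, 0.9900]
clamped = wav.clamp(-limit, limit)                # peaks flattened: [0.5000, -0.9900, 0.9900]
```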

+ # --- Music Generation Class ---
+ class MusicGenerator:
+     def __init__(self, device="cuda:0", basic_model_config=f'{XCODEC_MINI_INFER_DIR}/final_ckpt/config.yaml', resume_path=f'{XCODEC_MINI_INFER_DIR}/final_ckpt/ckpt_00360000.pth'):
+         self.device = torch.device(device if torch.cuda.is_available() else "cpu")
+         self.mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
+         self.codectool = CodecManipulator("xcodec", 0, 1)
+         model_config = OmegaConf.load(basic_model_config)
+         self.codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(self.device)
+         parameter_dict = torch.load(resume_path, map_location='cpu')
+         self.codec_model.load_state_dict(parameter_dict['codec_model'])
+         self.codec_model.to(self.device)
+         self.codec_model.eval()
+         # Load the stage-1 model to the GPU once, at initialization time
+         self.stage1_model = AutoModelForCausalLM.from_pretrained(
+             MODEL_ID,
+             torch_dtype=torch.float16,
+             attn_implementation="flash_attention_2",
+         ).to(self.device)
+         self.stage1_model.eval()
+ 
+     def generate(
+         self,
+         genre_txt=None,
+         lyrics_txt=None,
+         max_new_tokens=3000,
+         run_n_segments=2,
+         use_audio_prompt=False,
+         audio_prompt_path="",
+         prompt_start_time=0.0,
+         prompt_end_time=30.0,
+         output_dir=OUTPUT_DIR,
+         keep_intermediate=False,
+         disable_offload_model=False,
+         rescale=False
+     ):
+         if use_audio_prompt and not audio_prompt_path:
+             raise FileNotFoundError("Please provide an audio prompt filepath via 'audio_prompt_path' when 'use_audio_prompt' is enabled!")
+ 
+         stage1_output_dir = os.path.join(output_dir, "stage1")
+         os.makedirs(stage1_output_dir, exist_ok=True)
+ 
+         stage1_output_set = []
+ 
+         genres = genre_txt.strip()
+         lyrics = split_lyrics(lyrics_txt + "\n")
+         full_lyrics = "\n".join(lyrics)
+         prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
+         prompt_texts += lyrics
+ 
+         random_id = uuid.uuid4()
+         output_seq = None
+         top_p = 0.93
+         temperature = 1.0
+         repetition_penalty = 1.2
+         start_of_segment = self.mmtokenizer.tokenize('[start_of_segment]')
+         end_of_segment = self.mmtokenizer.tokenize('[end_of_segment]')
+         raw_output = None
+         run_n_segments = min(run_n_segments + 1, len(lyrics))
+ 
+         print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
+ 
+         for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
+             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
+             guidance_scale = 1.5 if i <= 1 else 1.2
+             if i == 0:
+                 continue
+             if i == 1:
+                 if use_audio_prompt:
+                     audio_prompt = load_audio_mono(audio_prompt_path)
+                     audio_prompt.unsqueeze_(0)
+                     with torch.no_grad():
+                         raw_codes = self.codec_model.encode(audio_prompt.to(self.device), target_bw=0.5)
+                     raw_codes = raw_codes.transpose(0, 1)
+                     raw_codes = raw_codes.cpu().numpy().astype(np.int16)
+                     code_ids = self.codectool.npy2ids(raw_codes[0])
+                     audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]  # 50 is tps of xcodec
+                     audio_prompt_codec_ids = [self.mmtokenizer.soa] + self.codectool.sep_ids + audio_prompt_codec + [self.mmtokenizer.eoa]
+                     sentence_ids = self.mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + self.mmtokenizer.tokenize("[end_of_reference]")
+                     head_id = self.mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
+                 else:
+                     head_id = self.mmtokenizer.tokenize(prompt_texts[0])
+                 prompt_ids = head_id + start_of_segment + self.mmtokenizer.tokenize(section_text) + [self.mmtokenizer.soa] + self.codectool.sep_ids
              else:
+                 prompt_ids = end_of_segment + start_of_segment + self.mmtokenizer.tokenize(section_text) + [self.mmtokenizer.soa] + self.codectool.sep_ids
+ 
+             prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(self.device)
+             input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
+             # Use window slicing in case the output sequence exceeds the model's context
+             max_context = 16384 - max_new_tokens - 1
+             if input_ids.shape[-1] > max_context:
+                 print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
+                 input_ids = input_ids[:, -(max_context):]
+             with torch.no_grad():
+                 output_seq = self.stage1_model.generate(
+                     input_ids=input_ids,
+                     max_new_tokens=max_new_tokens,
+                     min_new_tokens=100,
+                     do_sample=True,
+                     top_p=top_p,
+                     temperature=temperature,
+                     repetition_penalty=repetition_penalty,
+                     eos_token_id=self.mmtokenizer.eoa,
+                     pad_token_id=self.mmtokenizer.eoa,
+                     logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
+                     guidance_scale=guidance_scale,
                  )
+             if output_seq[0][-1].item() != self.mmtokenizer.eoa:
+                 tensor_eoa = torch.as_tensor([[self.mmtokenizer.eoa]]).to(self.stage1_model.device)
+                 output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
+             if i > 1:
+                 raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
+             else:
+                 raw_output = output_seq
+ 
+         print(len(raw_output))
+ 
+         ids = raw_output[0].cpu().numpy()
+         soa_idx = np.where(ids == self.mmtokenizer.soa)[0].tolist()
+         eoa_idx = np.where(ids == self.mmtokenizer.eoa)[0].tolist()
+ 
+         if len(soa_idx) != len(eoa_idx):
+             raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
+ 
+         vocals = []
+         instrumentals = []
+         range_begin = 1 if use_audio_prompt else 0
+         for i in range(range_begin, len(soa_idx)):
+             codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
+             if codec_ids[0] == 32016:
+                 codec_ids = codec_ids[1:]
+             codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
+             vocals_ids = self.codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
+             vocals.append(vocals_ids)
+             instrumentals_ids = self.codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
+             instrumentals.append(instrumentals_ids)
+ 
+         vocals = np.concatenate(vocals, axis=1)
+         instrumentals = np.concatenate(instrumentals, axis=1)
+         vocal_save_path = os.path.join(stage1_output_dir,
+                                        f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_vocal_{random_id}".replace(
+                                            '.', '@') + '.npy')
+         inst_save_path = os.path.join(stage1_output_dir,
+                                       f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_instrumental_{random_id}".replace(
+                                           '.', '@') + '.npy')
+ 
+         np.save(vocal_save_path, vocals)
+         np.save(inst_save_path, instrumentals)
+         stage1_output_set.append(vocal_save_path)
+         stage1_output_set.append(inst_save_path)
+ 
+         print("Converting to Audio...")
+ 
+         recons_output_dir = os.path.join(output_dir, "recons")
+         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
+         os.makedirs(recons_mix_dir, exist_ok=True)
+         tracks = []
+ 
+         for npy in stage1_output_set:
+             codec_result = np.load(npy)
+             decodec_rlt = []
+             with torch.no_grad():
+                 decoded_waveform = self.codec_model.decode(
+                     torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(self.device))
+             decoded_waveform = decoded_waveform.cpu().squeeze(0)
+             decodec_rlt.append(torch.as_tensor(decoded_waveform))
+             decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+             save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
+             tracks.append(save_path)
+             save_audio(decodec_rlt, save_path, 16000)
+ 
+         for inst_path in tracks:
+             try:
+                 if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
+                         and 'instrumental' in inst_path:
+                     vocal_path = inst_path.replace('instrumental', 'vocal')
+                     if not os.path.exists(vocal_path):
+                         continue
+                     recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
+                     vocal_stem, sr = sf.read(vocal_path)
+                     instrumental_stem, _ = sf.read(inst_path)
+                     mix_stem = vocal_stem + instrumental_stem
+                     sf.write(recons_mix, mix_stem, sr)
+             except Exception as e:
+                 print(e)
+ 
+         return recons_mix
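Segments are generated autoregressively, with each new prompt concatenated onto all previously generated tokens, so the window-slicing guard in `generate` keeps prompt plus continuation within the model's 16,384-token context. A toy illustration of the arithmetic:

```python
# When the accumulated output would overflow the context, keep only the most
# recent max_context tokens as conditioning for the next segment.
import torch

max_new_tokens = 3000
max_context = 16384 - max_new_tokens - 1      # 13383 tokens of history
input_ids = torch.arange(20000).unsqueeze(0)  # pretend accumulated sequence
if input_ids.shape[-1] > max_context:
    input_ids = input_ids[:, -max_context:]
print(input_ids.shape)  # torch.Size([1, 13383]); the oldest tokens are dropped
```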
+ 
+ 
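Stage 1 emits a single stream in which vocal and instrumental codec tokens alternate token by token; `rearrange(codec_ids, "(n b) -> b n", b=2)` de-interleaves it into the two stems. A small numeric sketch:

```python
# Interleaved stream [v0, i0, v1, i1, v2, i2] -> two per-stem rows.
import numpy as np
from einops import rearrange

codec_ids = np.array([10, 20, 11, 21, 12, 22])
stems = rearrange(codec_ids, "(n b) -> b n", b=2)
print(stems[0])  # [10 11 12] -- vocal token stream
print(stems[1])  # [20 21 22] -- instrumental token stream
```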

  # --- Gradio Interface ---
+ music_generator = MusicGenerator()  # Initialize the music generator here to keep the model loaded
+ 
  @spaces.GPU(duration=120)
  def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200):
+     """Inference function for the Gradio interface."""
      os.makedirs(OUTPUT_DIR, exist_ok=True)
      print(f"Output folder ensured at: {OUTPUT_DIR}")
      empty_output_folder(OUTPUT_DIR)
+ 
      try:
+         music = music_generator.generate(
+             genre_txt=genre_txt_content,
+             lyrics_txt=lyrics_txt_content,
+             run_n_segments=num_segments,
+             output_dir=OUTPUT_DIR,
              max_new_tokens=max_new_tokens
          )
+         return music
+     except Exception as e:
+         print(f"Error occurred during inference: {e}")
          return None
      finally:
          print("Temporary files deleted.")

+ 
  with gr.Blocks() as demo:
      with gr.Column():
          gr.Markdown("# YuE: Open Music Foundation Models for Full-Song Generation")
@@ -387,7 +368,7 @@ with gr.Blocks() as demo:
          <div style="display:flex;column-gap:4px;">
              <a href="https://github.com/multimodal-art-projection/YuE">
                  <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+             </a>
              <a href="https://map-yue.github.io">
                  <img src='https://img.shields.io/badge/Project-Page-green'>
              </a>
@@ -400,11 +381,11 @@ with gr.Blocks() as demo:
      with gr.Column():
          genre_txt = gr.Textbox(label="Genre")
          lyrics_txt = gr.Textbox(label="Lyrics")
+ 
      with gr.Column():
          if IS_SHARED_UI:
              num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
+             max_new_tokens = gr.Slider(label="Max New Tokens", info="100 tokens equals 1 second of music", minimum=100, maximum=3000, step=100, value=500, interactive=True)
          else:
              num_segments = gr.Number(label="Number of Song Segments", value=2, interactive=True)
              max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum=24000, step=500, value=3000, interactive=True)
@@ -412,7 +393,7 @@ with gr.Blocks() as demo:
      music_out = gr.Audio(label="Audio Result")

      gr.Examples(
+         examples=[
          [
              "female blues airy vocal bright vocal piano sad romantic guitar jazz",
              """[verse]
@@ -447,26 +428,17 @@ Through the highs and lows, I'mma keep it real
  Living out my dreams with this mic and a deal
  """
          ]
+         ],
+         inputs=[genre_txt, lyrics_txt],
+         outputs=[music_out],
+         cache_examples=False,
+         # cache_mode="lazy",  # caching not enabled yet
          fn=infer
      )
+ 
      submit_btn.click(
+         fn=infer,
+         inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
+         outputs=[music_out]
      )
+ demo.queue().launch(show_api=False, show_error=True)
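For reference, the mix step inside `generate` simply sums the decoded vocal and instrumental renders sample for sample. A standalone sketch of the same operation (file names are illustrative; like the app itself, it assumes a libsndfile build that can read MP3):

```python
import soundfile as sf

vocal, sr = sf.read("vocal.mp3")
instrumental, _ = sf.read("instrumental.mp3")
n = min(len(vocal), len(instrumental))    # guard against length mismatch
mix = (vocal[:n] + instrumental[:n]) / 2  # average for clipping headroom
sf.write("mixed.wav", mix, sr)
```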