KingNish committed
Commit 22e7225
Parent(s): 725074b

modified: app.py

Files changed (1)
  1. app.py +249 -179
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import subprocess
3
- import os
4
  import shutil
5
  import tempfile
6
  import spaces
@@ -27,10 +27,10 @@ def install_flash_attn():
27
  # Install flash-attn
28
  install_flash_attn()
29
 
30
- from huggingface_hub import snapshot_download
31
 
32
  # Create xcodec_mini_infer folder
33
- folder_path = './xcodec_mini_infer'
34
 
35
  # Create the folder if it doesn't exist
36
  if not os.path.exists(folder_path):
@@ -41,15 +41,87 @@ else:
41
 
42
  snapshot_download(
43
  repo_id = "m-a-p/xcodec_mini_infer",
44
- local_dir = "./xcodec_mini_infer"
45
  )
46
 
47
- # Add xcodec_mini_infer and descriptaudiocodec to sys path
48
  import sys
49
  sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
50
  sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
51
-
52
  import argparse
 
53
  import numpy as np
54
  import json
55
  from omegaconf import OmegaConf
@@ -72,97 +144,93 @@ from vocoder import build_codec_model, process_audio
72
  from post_process_audio import replace_low_freq_with_energy_matched
73
  import re
74
 
75
-
76
- # --- Arguments and Model Loading from infer.py ---
77
- parser = argparse.ArgumentParser()
78
- # Model Configuration:
79
- parser.add_argument("--stage1_model", type=str, default="m-a-p/YuE-s1-7B-anneal-en-cot", help="The model checkpoint path or identifier for the Stage 1 model.")
80
- parser.add_argument("--max_new_tokens", type=int, default=3000, help="The maximum number of new tokens to generate in one pass during text generation.")
81
- parser.add_argument("--run_n_segments", type=int, default=2, help="The number of segments to process during the generation.")
82
- # Prompt
83
- parser.add_argument("--genre_txt", type=str, default="", help="The file path to a text file containing genre tags that describe the musical style or characteristics (e.g., instrumental, genre, mood, vocal timbre, vocal gender). This is used as part of the generation prompt.") # Modified: removed required=True and using default=""
84
- parser.add_argument("--lyrics_txt", type=str, default="", help="The file path to a text file containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process.") # Modified: removed required=True and using default=""
85
- parser.add_argument("--use_audio_prompt", action="store_true", help="If set, the model will use an audio file as a prompt during generation. The audio file should be specified using --audio_prompt_path.")
86
- parser.add_argument("--audio_prompt_path", type=str, default="", help="The file path to an audio file to use as a reference prompt when --use_audio_prompt is enabled.")
87
- parser.add_argument("--prompt_start_time", type=float, default=0.0, help="The start time in seconds to extract the audio prompt from the given audio file.")
88
- parser.add_argument("--prompt_end_time", type=float, default=30.0, help="The end time in seconds to extract the audio prompt from the given audio file.")
89
- # Output
90
- parser.add_argument("--output_dir", type=str, default="./output", help="The directory where generated outputs will be saved.")
91
- parser.add_argument("--keep_intermediate", action="store_true", help="If set, intermediate outputs will be saved during processing.")
92
- parser.add_argument("--disable_offload_model", action="store_true", help="If set, the model will not be offloaded from the GPU to CPU after Stage 1 inference.")
93
- parser.add_argument("--cuda_idx", type=int, default=0)
94
- # Config for xcodec and upsampler
95
- parser.add_argument('--basic_model_config', default='./xcodec_mini_infer/final_ckpt/config.yaml', help='YAML files for xcodec configurations.')
96
- parser.add_argument('--resume_path', default='./xcodec_mini_infer/final_ckpt/ckpt_00360000.pth', help='Path to the xcodec checkpoint.')
97
- parser.add_argument('--config_path', type=str, default='./xcodec_mini_infer/decoders/config.yaml', help='Path to Vocos config file.')
98
- parser.add_argument('--vocal_decoder_path', type=str, default='./xcodec_mini_infer/decoders/decoder_131000.pth', help='Path to Vocos decoder weights.')
99
- parser.add_argument('--inst_decoder_path', type=str, default='./xcodec_mini_infer/decoders/decoder_151000.pth', help='Path to Vocos decoder weights.')
100
- parser.add_argument('-r', '--rescale', action='store_true', help='Rescale output to avoid clipping.')
101
-
102
-
103
- args = parser.parse_args([]) # Modified: Pass empty list to parse_args to avoid command line parsing in Gradio
104
-
105
- if args.use_audio_prompt and not args.audio_prompt_path:
106
- raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
107
- model_name = args.stage1_model # Modified: Renamed 'model' to 'model_name' to avoid shadowing the loaded model later
108
- cuda_idx = args.cuda_idx
109
- max_new_tokens_config = args.max_new_tokens # Modified: Renamed 'max_new_tokens' to 'max_new_tokens_config' to avoid shadowing the Gradio input
110
- stage1_output_dir = os.path.join(args.output_dir, f"stage1")
111
- os.makedirs(stage1_output_dir, exist_ok=True)
112
-
113
- # load tokenizer and model
114
- device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
115
-
116
- # Now you can use `device` to move your tensors or models to the GPU (if available)
117
- print(f"Using device: {device}")
118
-
119
- mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
120
-
121
- codectool = CodecManipulator("xcodec", 0, 1)
122
- model_config = OmegaConf.load(args.basic_model_config)
123
- codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
124
- parameter_dict = torch.load(args.resume_path, map_location='cpu')
125
- codec_model.load_state_dict(parameter_dict['codec_model'])
126
- codec_model.to(device)
127
- codec_model.eval()
128
-
129
- class BlockTokenRangeProcessor(LogitsProcessor):
130
- def __init__(self, start_id, end_id):
131
- self.blocked_token_ids = list(range(start_id, end_id))
132
-
133
- def __call__(self, input_ids, scores):
134
- scores[:, self.blocked_token_ids] = -float("inf")
135
- return scores
136
-
137
- def load_audio_mono(filepath, sampling_rate=16000):
138
- audio, sr = torchaudio.load(filepath)
139
- # Convert to mono
140
- audio = torch.mean(audio, dim=0, keepdim=True)
141
- # Resample if needed
142
- if sr != sampling_rate:
143
- resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
144
- audio = resampler(audio)
145
- return audio
146
-
147
- def split_lyrics(lyrics):
148
- pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
149
- segments = re.findall(pattern, lyrics, re.DOTALL)
150
- structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
151
- return structured_lyrics
152
-
153
- def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run): # Modified: Function to encapsulate generation logic
154
- stage1_output_set_local = [] # Modified: Local variable to store output paths
155
-
156
- lyrics = split_lyrics(lyrics_content)
157
- print(len(lyrics))
158
  # intruction
159
  full_lyrics = "\n".join(lyrics)
160
  prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
161
  prompt_texts += lyrics
162
 
 
163
  random_id = uuid.uuid4()
164
  output_seq = None
165
-
166
  # Here is suggested decoding config
167
  top_p = 0.93
168
  temperature = 1.0
@@ -174,20 +242,18 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
174
  raw_output = None
175
 
176
  # Format text prompt
177
- run_n_segments = min(num_segments_run+1, len(lyrics)) # Modified: Use passed num_segments_run
178
 
179
  print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
180
 
181
- global model # Modified: Declare model as global to use the loaded model in Gradio scope
182
-
183
  for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
184
  section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
185
  guidance_scale = 1.5 if i <=1 else 1.2
186
  if i==0:
187
  continue
188
  if i==1:
189
- if args.use_audio_prompt:
190
- audio_prompt = load_audio_mono(args.audio_prompt_path)
191
  audio_prompt.unsqueeze_(0)
192
  with torch.no_grad():
193
  raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
@@ -195,7 +261,7 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
195
  raw_codes = raw_codes.cpu().numpy().astype(np.int16)
196
  # Format audio prompt
197
  code_ids = codectool.npy2ids(raw_codes[0])
198
- audio_prompt_codec = code_ids[int(args.prompt_start_time *50): int(args.prompt_end_time *50)] # 50 is tps of xcodec
199
  audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
200
  sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
201
  head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
@@ -205,22 +271,22 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
205
  else:
206
  prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
207
 
208
- prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
209
  input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
210
  # Use window slicing in case output sequence exceeds the context of model
211
- max_context = 16384-max_new_tokens_config-1 # Modified: Use max_new_tokens_config
212
  if input_ids.shape[-1] > max_context:
213
  print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
214
  input_ids = input_ids[:, -(max_context):]
215
  with torch.no_grad():
216
  output_seq = model.generate(
217
- input_ids=input_ids,
218
- max_new_tokens=max_new_tokens_run, # Modified: Use max_new_tokens_run
219
- min_new_tokens=100,
220
- do_sample=True,
221
  top_p=top_p,
222
- temperature=temperature,
223
- repetition_penalty=repetition_penalty,
224
  eos_token_id=mmtokenizer.eoa,
225
  pad_token_id=mmtokenizer.eoa,
226
  logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
@@ -244,7 +310,7 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
244
 
245
  vocals = []
246
  instrumentals = []
247
- range_begin = 1 if args.use_audio_prompt else 0
248
  for i in range(range_begin, len(soa_idx)):
249
  codec_ids = ids[soa_idx[i]+1:eoa_idx[i]]
250
  if codec_ids[0] == 32016:
@@ -256,19 +322,19 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
256
  instrumentals.append(instrumentals_ids)
257
  vocals = np.concatenate(vocals, axis=1)
258
  instrumentals = np.concatenate(instrumentals, axis=1)
259
- vocal_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens_run}_vocal_{random_id}".replace('.', '@')+'.npy') # Modified: Use max_new_tokens_run in filename
260
- inst_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens_run}_instrumental_{random_id}".replace('.', '@')+'.npy') # Modified: Use max_new_tokens_run in filename
261
  np.save(vocal_save_path, vocals)
262
  np.save(inst_save_path, instrumentals)
263
- stage1_output_set_local.append(vocal_save_path)
264
- stage1_output_set_local.append(inst_save_path)
265
 
266
 
267
- # offload model - Removed offloading for gradio integration to keep model loaded
268
- # if not args.disable_offload_model:
269
- # model.cpu()
270
- # del model
271
- # torch.cuda.empty_cache()
272
 
273
  print("Converting to Audio...")
274
 
@@ -282,11 +348,11 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
282
  wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
283
  torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
284
  # reconstruct tracks
285
- recons_output_dir = os.path.join(args.output_dir, "recons")
286
  recons_mix_dir = os.path.join(recons_output_dir, 'mix')
287
  os.makedirs(recons_mix_dir, exist_ok=True)
288
  tracks = []
289
- for npy in stage1_output_set_local: # Modified: Use stage1_output_set_local
290
  codec_result = np.load(npy)
291
  decodec_rlt=[]
292
  with torch.no_grad():
@@ -316,26 +382,22 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
316
  print(e)
317
 
318
  # vocoder to upsample audios
319
- vocal_decoder, inst_decoder = build_codec_model(args.config_path, args.vocal_decoder_path, args.inst_decoder_path)
320
- vocoder_output_dir = os.path.join(args.output_dir, 'vocoder')
321
  vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
322
  vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
323
  os.makedirs(vocoder_mix_dir, exist_ok=True)
324
  os.makedirs(vocoder_stems_dir, exist_ok=True)
325
-
326
- instrumental_output = None # Initialize outside try block
327
- vocal_output = None # Initialize outside try block
328
- recons_mix_path = "" # Initialize outside try block
329
-
330
-
331
- for npy in stage1_output_set_local: # Modified: Use stage1_output_set_local
332
  if 'instrumental' in npy:
333
  # Process instrumental
334
  instrumental_output = process_audio(
335
  npy,
336
  os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
337
- args.rescale,
338
- args,
339
  inst_decoder,
340
  codec_model
341
  )
@@ -344,60 +406,34 @@ def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run)
344
  vocal_output = process_audio(
345
  npy,
346
  os.path.join(vocoder_stems_dir, 'vocal.mp3'),
347
- args.rescale,
348
- args,
349
  vocal_decoder,
350
  codec_model
351
  )
352
  # mix tracks
353
  try:
354
  mix_output = instrumental_output + vocal_output
355
- recons_mix_path_temp = os.path.join(recons_mix_dir, os.path.basename(recons_mix)) # Use recons_mix from previous step
356
- save_audio(mix_output, recons_mix_path_temp, 44100, args.rescale)
357
- print(f"Created mix: {recons_mix_path_temp}")
358
- recons_mix_path = recons_mix_path_temp # Assign to outer scope variable
359
  except RuntimeError as e:
360
  print(e)
361
- print(f"mix {recons_mix_path} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
362
 
363
  # Post process
364
- final_output_path = os.path.join(args.output_dir, os.path.basename(recons_mix_path)) # Use recons_mix_path from previous step
365
  replace_low_freq_with_energy_matched(
366
- a_file=recons_mix_path, # 16kHz # Use recons_mix_path
367
- b_file=recons_mix_path_temp, # 48kHz # Use recons_mix_path_temp
368
- c_file=final_output_path,
369
  cutoff_freq=5500.0
370
  )
371
  print("All process Done")
372
- return final_output_path # Modified: Return the final output audio path
373
-
374
-
375
- # Gradio UI
376
- model = AutoModelForCausalLM.from_pretrained( # Load model here for Gradio scope
377
- "m-a-p/YuE-s1-7B-anneal-en-cot",
378
- torch_dtype=torch.float16,
379
- attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
380
- ).to(device).eval() # Modified: Load model globally for Gradio to access
381
-
382
- def empty_output_folder(output_dir):
383
- # List all files in the output directory
384
- files = os.listdir(output_dir)
385
 
386
- # Iterate over the files and remove them
387
- for file in files:
388
- file_path = os.path.join(output_dir, file)
389
- try:
390
- if os.path.isdir(file_path):
391
- # If it's a directory, remove it recursively
392
- shutil.rmtree(file_path)
393
- else:
394
- # If it's a file, delete it
395
- os.remove(file_path)
396
- except Exception as e:
397
- print(f"Error deleting file {file_path}: {e}")
398
 
399
  @spaces.GPU(duration=120)
400
- def infer_gradio(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200): # Modified: Renamed infer to infer_gradio to avoid conflict
401
 
402
  # Ensure the output folder exists
403
  output_dir = "./output"
@@ -405,17 +441,51 @@ def infer_gradio(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_
405
  print(f"Output folder ensured at: {output_dir}")
406
 
407
  empty_output_folder(output_dir)
408
-
409
- # Call the generation function directly
410
- output_audio_path = generate_music(genre_txt_content, lyrics_txt_content, int(num_segments), int(max_new_tokens)) # Modified: Call generate_music and pass num_segments and max_new_tokens as int
411
-
412
- if output_audio_path and os.path.exists(output_audio_path):
413
- print("Generated audio file:", output_audio_path)
414
- return output_audio_path
415
- else:
416
- print("No audio file generated or path is invalid.")
417
  return None
418

419

420
  with gr.Blocks() as demo:
421
  with gr.Column():
@@ -424,7 +494,7 @@ with gr.Blocks() as demo:
424
  <div style="display:flex;column-gap:4px;">
425
  <a href="https://github.com/multimodal-art-projection/YuE">
426
  <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
427
- </a>
428
  <a href="https://map-yue.github.io">
429
  <img src='https://img.shields.io/badge/Project-Page-green'>
430
  </a>
@@ -437,7 +507,7 @@ with gr.Blocks() as demo:
437
  with gr.Column():
438
  genre_txt = gr.Textbox(label="Genre")
439
  lyrics_txt = gr.Textbox(label="Lyrics")
440
-
441
  with gr.Column():
442
  if is_shared_ui:
443
  num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
@@ -484,16 +554,16 @@ Through the highs and lows, I'mma keep it real
484
  Living out my dreams with this mic and a deal
485
  """
486
  ]
487
- ],
488
  inputs = [genre_txt, lyrics_txt],
489
  outputs = [music_out],
490
  cache_examples = False,
491
  # cache_mode="lazy",
492
- fn=infer_gradio # Modified: Use infer_gradio
493
  )
494
-
495
  submit_btn.click(
496
- fn = infer_gradio, # Modified: Use infer_gradio
497
  inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
498
  outputs = [music_out]
499
  )
 
1
  import gradio as gr
2
  import subprocess
3
+ import os
4
  import shutil
5
  import tempfile
6
  import spaces
 
27
  # Install flash-attn
28
  install_flash_attn()
29
 
30
+ from huggingface_hub import snapshot_download
31
 
32
  # Create xcodec_mini_infer folder
33
+ folder_path = './inference/xcodec_mini_infer'
34
 
35
  # Create the folder if it doesn't exist
36
  if not os.path.exists(folder_path):
 
41
 
42
  snapshot_download(
43
  repo_id = "m-a-p/xcodec_mini_infer",
44
+ local_dir = "./inference/xcodec_mini_infer"
45
  )
46
 
47
+ # Change to the "inference" directory
48
+ inference_dir = "./inference"
49
+ try:
50
+ os.chdir(inference_dir)
51
+ print(f"Changed working directory to: {os.getcwd()}")
52
+ except FileNotFoundError:
53
+ print(f"Directory not found: {inference_dir}")
54
+ exit(1)
55
+
56
+ def empty_output_folder(output_dir):
57
+ # List all files in the output directory
58
+ files = os.listdir(output_dir)
59
+
60
+ # Iterate over the files and remove them
61
+ for file in files:
62
+ file_path = os.path.join(output_dir, file)
63
+ try:
64
+ if os.path.isdir(file_path):
65
+ # If it's a directory, remove it recursively
66
+ shutil.rmtree(file_path)
67
+ else:
68
+ # If it's a file, delete it
69
+ os.remove(file_path)
70
+ except Exception as e:
71
+ print(f"Error deleting file {file_path}: {e}")
72
+
73
+ # Function to create a temporary file with string content
74
+ def create_temp_file(content, prefix, suffix=".txt"):
75
+ temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=prefix, suffix=suffix)
76
+ # Ensure content ends with newline and normalize line endings
77
+ content = content.strip() + "\n\n" # Add extra newline at end
78
+ content = content.replace("\r\n", "\n").replace("\r", "\n")
79
+ temp_file.write(content)
80
+ temp_file.close()
81
+
82
+ # Debug: Print file contents
83
+ print(f"\nContent written to {prefix}{suffix}:")
84
+ print(content)
85
+ print("---")
86
+
87
+ return temp_file.name
88
+
89
+ def get_last_mp3_file(output_dir):
90
+ # List all files in the output directory
91
+ files = os.listdir(output_dir)
92
+
93
+ # Filter only .mp3 files
94
+ mp3_files = [file for file in files if file.endswith('.mp3')]
95
+
96
+ if not mp3_files:
97
+ print("No .mp3 files found in the output folder.")
98
+ return None
99
+
100
+ # Get the full path for the mp3 files
101
+ mp3_files_with_path = [os.path.join(output_dir, file) for file in mp3_files]
102
+
103
+ # Sort the files based on the modification time (most recent first)
104
+ mp3_files_with_path.sort(key=lambda x: os.path.getmtime(x), reverse=True)
105
+
106
+ # Return the most recent .mp3 file
107
+ return mp3_files_with_path[0]
108
+
109
+ device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
110
+
111
+ model = AutoModelForCausalLM.from_pretrained(
112
+ "m-a-p/YuE-s1-7B-anneal-en-cot",
113
+ torch_dtype=torch.float16,
114
+ attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
115
+ )
116
+ model.to(device)
117
+ model.eval()
118
+
119
+ import os
120
  import sys
121
  sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
122
  sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 
123
  import argparse
124
+ import torch
125
  import numpy as np
126
  import json
127
  from omegaconf import OmegaConf
 
144
  from post_process_audio import replace_low_freq_with_energy_matched
145
  import re
146
 
147
+ def generate_music(
148
+ stage1_model="m-a-p/YuE-s1-7B-anneal-en-cot",
149
+ max_new_tokens=3000,
150
+ run_n_segments=2,
151
+ genre_txt=None,
152
+ lyrics_txt=None,
153
+ use_audio_prompt=False,
154
+ audio_prompt_path="",
155
+ prompt_start_time=0.0,
156
+ prompt_end_time=30.0,
157
+ output_dir="./output",
158
+ keep_intermediate=False,
159
+ disable_offload_model=False,
160
+ cuda_idx=0,
161
+ basic_model_config='./xcodec_mini_infer/final_ckpt/config.yaml',
162
+ resume_path='./xcodec_mini_infer/final_ckpt/ckpt_00360000.pth',
163
+ config_path='./xcodec_mini_infer/decoders/config.yaml',
164
+ vocal_decoder_path='./xcodec_mini_infer/decoders/decoder_131000.pth',
165
+ inst_decoder_path='./xcodec_mini_infer/decoders/decoder_151000.pth',
166
+ rescale=False,
167
+ ):
168
+ if use_audio_prompt and not audio_prompt_path:
169
+ raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
170
+
171
+ model = stage1_model
172
+ cuda_idx = cuda_idx
173
+ max_new_tokens = max_new_tokens
174
+ stage1_output_dir = os.path.join(output_dir, f"stage1")
175
+ os.makedirs(stage1_output_dir, exist_ok=True)
176
+
177
+ # load tokenizer and model
178
+ device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
179
+
180
+ # Now you can use `device` to move your tensors or models to the GPU (if available)
181
+ print(f"Using device: {device}")
182
+
183
+ mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
184
+
185
+ codectool = CodecManipulator("xcodec", 0, 1)
186
+ model_config = OmegaConf.load(basic_model_config)
187
+ codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
188
+ parameter_dict = torch.load(resume_path, map_location='cpu')
189
+ codec_model.load_state_dict(parameter_dict['codec_model'])
190
+ codec_model.to(device)
191
+ codec_model.eval()
192
+
193
+ class BlockTokenRangeProcessor(LogitsProcessor):
194
+ def __init__(self, start_id, end_id):
195
+ self.blocked_token_ids = list(range(start_id, end_id))
196
+
197
+ def __call__(self, input_ids, scores):
198
+ scores[:, self.blocked_token_ids] = -float("inf")
199
+ return scores
200
+
201
+ def load_audio_mono(filepath, sampling_rate=16000):
202
+ audio, sr = torchaudio.load(filepath)
203
+ # Convert to mono
204
+ audio = torch.mean(audio, dim=0, keepdim=True)
205
+ # Resample if needed
206
+ if sr != sampling_rate:
207
+ resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
208
+ audio = resampler(audio)
209
+ return audio
210
+
211
+ def split_lyrics(lyrics):
212
+ pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
213
+ segments = re.findall(pattern, lyrics, re.DOTALL)
214
+ structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
215
+ return structured_lyrics
216
+
217
+ # Call the function and print the result
218
+ stage1_output_set = []
219
+ # Tips:
220
+ # genre tags support instrumental,genre,mood,vocal timbr and vocal gender
221
+ # all kinds of tags are needed
222
+ with open(genre_txt) as f:
223
+ genres = f.read().strip()
224
+ with open(lyrics_txt) as f:
225
+ lyrics = split_lyrics(f.read())
226
  # intruction
227
  full_lyrics = "\n".join(lyrics)
228
  prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
229
  prompt_texts += lyrics
230
 
231
+
232
  random_id = uuid.uuid4()
233
  output_seq = None
 
234
  # Here is suggested decoding config
235
  top_p = 0.93
236
  temperature = 1.0
 
242
  raw_output = None
243
 
244
  # Format text prompt
245
+ run_n_segments = min(run_n_segments+1, len(lyrics))
246
 
247
  print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
248
249
  for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
250
  section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
251
  guidance_scale = 1.5 if i <=1 else 1.2
252
  if i==0:
253
  continue
254
  if i==1:
255
+ if use_audio_prompt:
256
+ audio_prompt = load_audio_mono(audio_prompt_path)
257
  audio_prompt.unsqueeze_(0)
258
  with torch.no_grad():
259
  raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
 
261
  raw_codes = raw_codes.cpu().numpy().astype(np.int16)
262
  # Format audio prompt
263
  code_ids = codectool.npy2ids(raw_codes[0])
264
+ audio_prompt_codec = code_ids[int(prompt_start_time *50): int(prompt_end_time *50)] # 50 is tps of xcodec
265
  audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
266
  sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
267
  head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
 
271
  else:
272
  prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
273
 
274
+ prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
275
  input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
276
  # Use window slicing in case output sequence exceeds the context of model
277
+ max_context = 16384-max_new_tokens-1
278
  if input_ids.shape[-1] > max_context:
279
  print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
280
  input_ids = input_ids[:, -(max_context):]
281
  with torch.no_grad():
282
  output_seq = model.generate(
283
+ input_ids=input_ids,
284
+ max_new_tokens=max_new_tokens,
285
+ min_new_tokens=100,
286
+ do_sample=True,
287
  top_p=top_p,
288
+ temperature=temperature,
289
+ repetition_penalty=repetition_penalty,
290
  eos_token_id=mmtokenizer.eoa,
291
  pad_token_id=mmtokenizer.eoa,
292
  logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
 
310
 
311
  vocals = []
312
  instrumentals = []
313
+ range_begin = 1 if use_audio_prompt else 0
314
  for i in range(range_begin, len(soa_idx)):
315
  codec_ids = ids[soa_idx[i]+1:eoa_idx[i]]
316
  if codec_ids[0] == 32016:
 
322
  instrumentals.append(instrumentals_ids)
323
  vocals = np.concatenate(vocals, axis=1)
324
  instrumentals = np.concatenate(instrumentals, axis=1)
325
+ vocal_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_vocal_{random_id}".replace('.', '@')+'.npy')
326
+ inst_save_path = os.path.join(stage1_output_dir, f"cot_{genres.replace(' ', '-')}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens}_instrumental_{random_id}".replace('.', '@')+'.npy')
327
  np.save(vocal_save_path, vocals)
328
  np.save(inst_save_path, instrumentals)
329
+ stage1_output_set.append(vocal_save_path)
330
+ stage1_output_set.append(inst_save_path)
331
 
332
 
333
+ # offload model
334
+ if not disable_offload_model:
335
+ model.cpu()
336
+ del model
337
+ torch.cuda.empty_cache()
338
 
339
  print("Converting to Audio...")
340
 
 
348
  wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
349
  torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
350
  # reconstruct tracks
351
+ recons_output_dir = os.path.join(output_dir, "recons")
352
  recons_mix_dir = os.path.join(recons_output_dir, 'mix')
353
  os.makedirs(recons_mix_dir, exist_ok=True)
354
  tracks = []
355
+ for npy in stage1_output_set:
356
  codec_result = np.load(npy)
357
  decodec_rlt=[]
358
  with torch.no_grad():
 
382
  print(e)
383
 
384
  # vocoder to upsample audios
385
+ vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
386
+ vocoder_output_dir = os.path.join(output_dir, 'vocoder')
387
  vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
388
  vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
389
  os.makedirs(vocoder_mix_dir, exist_ok=True)
390
  os.makedirs(vocoder_stems_dir, exist_ok=True)
391
+ instrumental_output = None
392
+ vocal_output = None
393
+ for npy in stage1_output_set:
394
  if 'instrumental' in npy:
395
  # Process instrumental
396
  instrumental_output = process_audio(
397
  npy,
398
  os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
399
+ rescale,
400
+ argparse.Namespace(**locals()), # Convert local variables to argparse.Namespace
401
  inst_decoder,
402
  codec_model
403
  )
 
406
  vocal_output = process_audio(
407
  npy,
408
  os.path.join(vocoder_stems_dir, 'vocal.mp3'),
409
+ rescale,
410
+ argparse.Namespace(**locals()), # Convert local variables to argparse.Namespace
411
  vocal_decoder,
412
  codec_model
413
  )
414
  # mix tracks
415
  try:
416
  mix_output = instrumental_output + vocal_output
417
+ vocoder_mix = os.path.join(vocoder_mix_dir, os.path.basename(recons_mix))
418
+ save_audio(mix_output, vocoder_mix, 44100, rescale)
419
+ print(f"Created mix: {vocoder_mix}")
420
+ return vocoder_mix
421
  except RuntimeError as e:
422
  print(e)
423
+ print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
424
 
425
  # Post process
 
426
  replace_low_freq_with_energy_matched(
427
+ a_file=recons_mix, # 16kHz
428
+ b_file=vocoder_mix, # 48kHz
429
+ c_file=os.path.join(output_dir, os.path.basename(recons_mix)),
430
  cutoff_freq=5500.0
431
  )
432
  print("All process Done")
433
 
434
 
435
  @spaces.GPU(duration=120)
436
+ def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200):
437
 
438
  # Ensure the output folder exists
439
  output_dir = "./output"
 
441
  print(f"Output folder ensured at: {output_dir}")
442
 
443
  empty_output_folder(output_dir)
444
+
445
+ # Command and arguments with optimized settings
446
+ command = [
447
+ "python", "infer.py",
448
+ "--stage1_model", model,
449
+ # "--stage2_model", "m-a-p/YuE-s2-1B-general",
450
+ "--genre_txt", f"{genre_txt_content}",
451
+ "--lyrics_txt", f"{lyrics_txt_content}",
452
+ "--run_n_segments", f"{num_segments}",
453
+ # "--stage2_batch_size", "4",
454
+ "--output_dir", f"{output_dir}",
455
+ "--cuda_idx", "0",
456
+ "--max_new_tokens", f"{max_new_tokens}",
457
+ # "--disable_offload_model"
458
+ ]
459
+
460
+ # Execute the command
461
+ try:
462
+ music = generate_music(stage1_model=model, genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments, output_dir=output_dir, cuda_idx=0, max_new_tokens=max_new_tokens)
463
+
464
+ # Check and print the contents of the output folder
465
+ output_files = os.listdir(output_dir)
466
+ if output_files:
467
+ print("Output folder contents:")
468
+ for file in output_files:
469
+ print(f"- {file}")
470
+
471
+ last_mp3 = get_last_mp3_file(output_dir)
472
+
473
+ if last_mp3:
474
+ print("Last .mp3 file:", last_mp3)
475
+ return last_mp3
476
+ else:
477
+ return None
478
+ else:
479
+ print("Output folder is empty.")
480
+ return None
481
+ except subprocess.CalledProcessError as e:
482
+ print(f"Error occurred: {e}")
483
  return None
484
+ finally:
485
+ # Clean up temporary files
486
+ print("Temporary files deleted.")
487
 
488
+ # Gradio
489
 
490
  with gr.Blocks() as demo:
491
  with gr.Column():
 
494
  <div style="display:flex;column-gap:4px;">
495
  <a href="https://github.com/multimodal-art-projection/YuE">
496
  <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
497
+ </a>
498
  <a href="https://map-yue.github.io">
499
  <img src='https://img.shields.io/badge/Project-Page-green'>
500
  </a>
 
507
  with gr.Column():
508
  genre_txt = gr.Textbox(label="Genre")
509
  lyrics_txt = gr.Textbox(label="Lyrics")
510
+
511
  with gr.Column():
512
  if is_shared_ui:
513
  num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
 
554
  Living out my dreams with this mic and a deal
555
  """
556
  ]
557
+ ],
558
  inputs = [genre_txt, lyrics_txt],
559
  outputs = [music_out],
560
  cache_examples = False,
561
  # cache_mode="lazy",
562
+ fn=infer
563
  )
564
+
565
  submit_btn.click(
566
+ fn = infer,
567
  inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
568
  outputs = [music_out]
569
  )