app.py
CHANGED
@@ -43,291 +43,231 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))

 import torch
-from huggingface_hub import snapshot_download
-import sys
-import uuid
 import numpy as np
 from omegaconf import OmegaConf
 import torchaudio
-from torchaudio.transforms import Resample
 import soundfile as sf
 from mmtokenizer import _MMSentencePieceTokenizer

 # Configuration Constants
-is_shared_ui = "innova-ai/YuE-music-generator-demo" in os.environ.get('SPACE_ID', '')

     model = AutoModelForCausalLM.from_pretrained(
-        torch_dtype=
         attn_implementation="flash_attention_2",
     ).to(DEVICE).eval()
-    return model

-# Preload all models and components
-model = load_models()

-# Audio processing cache
-resampler_cache = {}
-def get_resampler(orig_freq, new_freq):
-    key = (orig_freq, new_freq)
-    if key not in resampler_cache:
-        resampler_cache[key] = Resample(orig_freq=orig_freq, new_freq=new_freq).to(DEVICE)
-    return resampler_cache[key]

-    audio, sr = torchaudio.load(filepath)
-    audio = torch.mean(audio, dim=0, keepdim=True).to(DEVICE)
-    if sr != sampling_rate:
-        resampler = get_resampler(sr, sampling_rate)
-        audio = resampler(audio)
-    return audio

-@spaces.GPU(duration=120)
-def generate_music(
-    genre_txt=None,
-    lyrics_txt=None,
-    max_new_tokens=100,
-    run_n_segments=2,
-    use_audio_prompt=False,
-    audio_prompt_path="",
-    prompt_start_time=0.0,
-    prompt_end_time=30.0,
-    output_dir="./output",
-    keep_intermediate=False,
-    rescale=False,
-):
-    # Load tokenizer
     mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")

-    # Precompute token IDs
-    start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
-    end_of_segment = mmtokenizer.tokenize('[end_of_segment]')

-    # Load codec model
-    model_config = OmegaConf.load(CODEC_CONFIG_PATH)
-    codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(DEVICE)
-    parameter_dict = torch.load(CODEC_CKPT_PATH, map_location='cpu')
-    codec_model.load_state_dict(parameter_dict['codec_model'])
-    codec_model.eval()

-    # Initialize codec tools
     codectool = CodecManipulator("xcodec", 0, 1)

-        raw_codes = codec_model.encode(audio_prompt.unsqueeze(0), target_bw=0.5)
-        raw_codes = raw_codes.transpose(0, 1).cpu().numpy().astype(np.int16)

-        audio_prompt_codec = code_ids[int(prompt_start_time*50):int(prompt_end_time*50)]
-        audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]

-    with torch.inference_mode():
-        for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
-            if i == 0: continue  # Skip system prompt

-            # Prepare prompt
-            section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-            guidance_scale = 1.5 if i <= 1 else 1.2

-            if i == 1:
-                prompt_ids = mmtokenizer.tokenize(prompt_texts[0])
-                if use_audio_prompt:
-                    prompt_ids += mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
-                prompt_ids += start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-            else:
-                prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids

-                max_new_tokens=max_new_tokens,
-                min_new_tokens=100,
-                do_sample=True,
-                top_p=0.93,
-                temperature=1.0,
-                repetition_penalty=1.2,
-                eos_token_id=mmtokenizer.eoa,
-                pad_token_id=mmtokenizer.eoa,
-                logits_processor=LogitsProcessorList([
-                    BlockTokenRangeProcessor(0, 32002),
-                    BlockTokenRangeProcessor(32016, 32016)
-                ]),
-                guidance_scale=guidance_scale,
-            )

-    return save_and_mix_audio(vocals, instrumentals, genres, random_id, output_dir)

-        codec_ids = ids[soa_idx[i]+1:eoa_idx[i]]
-        codec_ids = codec_ids[:2 * (len(codec_ids) // 2)]

-    inst_buf = torch.as_tensor(instrumentals.astype(np.int16), device=DEVICE)

-    with torch.inference_mode():
-        vocal_wav = codec_model.decode(vocal_buf.unsqueeze(0).permute(1, 0, 2))
-        inst_wav = codec_model.decode(inst_buf.unsqueeze(0).permute(1, 0, 2))

     mixed = (vocal_wav + inst_wav) / 2

-    return

-# Gradio
-    gr.HTML("""
-    <div style="display:flex;column-gap:4px;">
-        <a href="https://github.com/multimodal-art-projection/YuE">
-            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-        </a>
-        <a href="https://map-yue.github.io">
-            <img src='https://img.shields.io/badge/Project-Page-green'>
-        </a>
-        <a href="https://huggingface.co/spaces/innova-ai/YuE-music-generator-demo?duplicate=true">
-            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
-        </a>
-    </div>
-    """)
-    with gr.Row():
-        with gr.Column():
-            genre_txt = gr.Textbox(label="Genre")
-            lyrics_txt = gr.Textbox(label="Lyrics")

-        with gr.Column():
-            if is_shared_ui:
-                num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
-                max_new_tokens = gr.Slider(label="Max New Tokens", info="100 tokens equals 1 second long music", minimum=100, maximum="3000", step=100, value=500, interactive=True)  # increase it after testing
-            else:
-                num_segments = gr.Number(label="Number of Song Segments", value=2, interactive=True)
-                max_new_tokens = gr.Slider(label="Max New Tokens", minimum=500, maximum="24000", step=500, value=3000, interactive=True)
-            submit_btn = gr.Button("Submit")
-            music_out = gr.Audio(label="Audio Result")

-            Lost within the silence, I hear your gentle voice
-            Guiding me back homeward, making my heart rejoice

-            With you here beside me, everything's alright
-            Can't imagine life alone, don't want to let you go
-            Stay with me forever, let our love just flow
-            """
-        ],
-        [
-            "rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
-            """[verse]
-            Woke up in the morning, sun is shining bright
-            Chasing all my dreams, gotta get my mind right
-            City lights are fading, but my vision's clear
-            Got my team beside me, no room for fear
-            Walking through the streets, beats inside my head
-            Every step I take, closer to the bread
-            People passing by, they don't understand
-            Building up my future with my own two hands

     submit_btn.click(
-        fn
-        inputs
-        outputs
     )
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))

+import gradio as gr
+import os
+import re  # needed by split_lyrics() below
+import shutil
+import tempfile
+import spaces
 import torch
 import numpy as np
+from pathlib import Path
+from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
 import torchaudio
 import soundfile as sf
+from functools import lru_cache
+from concurrent.futures import ThreadPoolExecutor
+from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessorList
+from models.soundstream_hubert_new import SoundStream
+from vocoder import build_codec_model
 from mmtokenizer import _MMSentencePieceTokenizer
+from codecmanipulator import CodecManipulator

+# --------------------------
 # Configuration Constants
+# --------------------------
+MODEL_DIR = Path("./xcodec_mini_infer")
+OUTPUT_DIR = Path("./output")
+DEVICE = "cuda:0"
+TORCH_DTYPE = torch.float16
+MAX_CONTEXT = 16384 - 3000 - 1
+MAX_SEQ_LEN = 16384
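An illustrative aside, not part of the diff: using the "100 tokens equals 1 second" figure quoted in the removed slider help text, the context budget above works out roughly as follows.

usable_tokens = 16384 - 3000 - 1      # = 13383, matching MAX_CONTEXT above
approx_seconds = usable_tokens / 100  # about 134 s of interleaved vocal+instrumental codes per request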

+# --------------------------
+# Preload Models with KV Cache Initialization
+# --------------------------
+@spaces.GPU
+def preload_models():
+    global model, mmtokenizer, codec_model, codectool, vocal_decoder, inst_decoder

+    # Text generation model with KV cache support
     model = AutoModelForCausalLM.from_pretrained(
+        "m-a-p/YuE-s1-7B-anneal-en-cot",
+        torch_dtype=TORCH_DTYPE,
         attn_implementation="flash_attention_2",
+        use_cache=True  # Enable KV caching
     ).to(DEVICE).eval()

+    # Tokenizer and codec tools
     mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
     codectool = CodecManipulator("xcodec", 0, 1)

+    # Audio codec model
+    model_config = OmegaConf.load(MODEL_DIR/"final_ckpt/config.yaml")
+    codec_model = SoundStream(**model_config.generator.config).to(DEVICE)
+    codec_model.load_state_dict(
+        torch.load(MODEL_DIR/"final_ckpt/ckpt_00360000.pth", map_location='cpu')['codec_model']
+    )
+    codec_model.eval()

+    # Vocoders
+    vocal_decoder, inst_decoder = build_codec_model(
+        MODEL_DIR/"decoders/config.yaml",
+        MODEL_DIR/"decoders/decoder_131000.pth",
+        MODEL_DIR/"decoders/decoder_151000.pth"
+    )

+# --------------------------
+# Optimized Generation with KV Cache Management
+# --------------------------
+class KVCacheManager:
+    def __init__(self, model):
+        self.model = model
+        self.past_key_values = None
+        self.current_length = 0

+    def reset(self):
+        self.past_key_values = None
+        self.current_length = 0

+    def generate_with_cache(self, input_ids, generation_config):
+        outputs = self.model(
+            input_ids,
+            past_key_values=self.past_key_values,
+            use_cache=True,
+            output_hidden_states=False,
+            return_dict=True
+        )

+        self.past_key_values = outputs.past_key_values
+        self.current_length += input_ids.shape[1]

+        return outputs.logits
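An illustrative aside, not part of the diff: this is roughly how KVCacheManager is meant to be driven. The prompt is pushed through once to fill the cache, then only the newly sampled token is fed on each later call; greedy decoding and the prompt text here are placeholders.

prompt_ids = torch.tensor([mmtokenizer.tokenize("...")], dtype=torch.long, device=DEVICE)
cm = KVCacheManager(model)
logits = cm.generate_with_cache(prompt_ids, None)        # full prompt: fills the KV cache
next_tok = logits[:, -1].argmax(dim=-1, keepdim=True)    # shape (1, 1)
for _ in range(16):
    logits = cm.generate_with_cache(next_tok, None)      # only the new token; cached keys/values are reused
    next_tok = logits[:, -1].argmax(dim=-1, keepdim=True)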

+def split_lyrics(lyrics: str):
+    pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
+    segments = re.findall(pattern, lyrics, re.DOTALL)
+    return [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
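An illustrative aside, not part of the diff: split_lyrics() turns section-tagged lyrics into one string per section. With a made-up two-section input:

text = "[verse]\nWoke up in the morning\nSun is shining bright\n[chorus]\nFeel the light\n"
split_lyrics(text)
# -> ['[verse]\nWoke up in the morning\nSun is shining bright\n\n', '[chorus]\nFeel the light\n\n']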

+@torch.inference_mode()
+def process_audio_batch(codec_ids, decoder, sample_rate=44100):
+    decoded = codec_model.decode(
+        torch.as_tensor(codec_ids.astype(np.int16), dtype=torch.long)
+        .unsqueeze(0).permute(1, 0, 2).to(DEVICE)
+    )
+    return decoded.cpu().squeeze(0)

+# --------------------------
+# Core Generation Logic with KV Cache
+# --------------------------
+def generate_music(genre_txt, lyrics_txt, num_segments=2, max_new_tokens=2000):
+    # Initialize KV cache manager
+    cache_manager = KVCacheManager(model)

+    # Preprocess inputs
+    genres = genre_txt.strip()
+    structured_lyrics = split_lyrics(lyrics_txt + "\n")
+    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{''.join(structured_lyrics)}"] + structured_lyrics

+    # Generation loop with KV cache
+    all_generated = []
+    for i in range(1, min(num_segments + 1, len(prompt_texts))):
+        input_ids = prepare_inputs(prompt_texts, i, all_generated)
+        input_ids = input_ids.to(DEVICE)

+        # Generate segment with KV cache
+        segment_output = []
+        for _ in range(max_new_tokens):
+            logits = cache_manager.generate_with_cache(input_ids, None)

+            # Sampling logic
+            probs = torch.nn.functional.softmax(logits[:, -1], dim=-1)
+            next_token = torch.multinomial(probs, num_samples=1)

+            segment_output.append(next_token.item())
+            input_ids = next_token  # already shaped (1, 1); feed only the new token on the next step

+            if next_token == mmtokenizer.eoa:
+                break

+        all_generated.extend(segment_output)

+        # Prevent cache overflow
+        if cache_manager.current_length > MAX_SEQ_LEN * 0.8:
+            cache_manager.reset()

+    # Process outputs
+    ids = np.array(all_generated)
+    vocals, instrumentals = process_outputs(ids)

+    # Parallel audio processing
+    with ThreadPoolExecutor() as executor:
+        vocal_future = executor.submit(process_audio_batch, vocals, vocal_decoder)
+        inst_future = executor.submit(process_audio_batch, instrumentals, inst_decoder)
+        vocal_wav = vocal_future.result()
+        inst_wav = inst_future.result()

+    # Mix and post-process
     mixed = (vocal_wav + inst_wav) / 2
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # ensure the output directory exists
+    final_path = OUTPUT_DIR / "final_output.mp3"
+    save_audio(mixed, final_path, 44100)
+    return str(final_path)
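An illustrative aside, not part of the diff: the removed transformers generate() call sampled with top_p=0.93 and temperature=1.0, whereas the loop above samples from the full softmax. A nucleus (top-p) filter could be applied to the last-step logits before torch.multinomial, along these lines:

def top_p_filter(logits_row, top_p=0.93):
    # keep the smallest set of tokens whose cumulative probability reaches top_p
    sorted_logits, sorted_idx = torch.sort(logits_row, descending=True)
    probs = torch.softmax(sorted_logits, dim=-1)
    cumulative = torch.cumsum(probs, dim=-1)
    keep = cumulative - probs < top_p  # always keeps at least the most likely token
    filtered = torch.full_like(logits_row, float("-inf"))
    filtered[sorted_idx[keep]] = logits_row[sorted_idx[keep]]
    return filtered

# usage inside the sampling loop:
# probs = torch.softmax(top_p_filter(logits[0, -1]), dim=-1).unsqueeze(0)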

+# --------------------------
+# Optimized Helper Functions
+# --------------------------
+def prepare_inputs(prompt_texts, index, previous_tokens):
+    # (no lru_cache here: the list arguments are unhashable)
+    current_prompt = mmtokenizer.tokenize(prompt_texts[index])
+    return torch.tensor([previous_tokens + current_prompt], dtype=torch.long, device=DEVICE)

+def process_outputs(ids):
+    soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
+    eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()

+    vocals = []
+    instrumentals = []
+    for i in range(len(soa_idx)):
+        codec_ids = ids[soa_idx[i]+1:eoa_idx[i]]
+        codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
+        vocals.append(codectool.ids2npy(codec_ids[::2]))
+        instrumentals.append(codectool.ids2npy(codec_ids[1::2]))

+    return np.concatenate(vocals, axis=1), np.concatenate(instrumentals, axis=1)
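An illustrative aside, not part of the diff: between each soa/eoa pair the token stream interleaves the two stems frame by frame, which is why even positions become vocal codes and odd positions instrumental codes. With made-up ids:

frame = np.array([101, 201, 102, 202, 103, 203])  # v1, i1, v2, i2, v3, i3
frame[::2]   # array([101, 102, 103])  -> vocal track codes
frame[1::2]  # array([201, 202, 203])  -> instrumental track codes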

+def save_audio(wav, path, sr):
+    wav = wav.clamp(-0.99, 0.99)
+    torchaudio.save(path, wav.cpu(), sr, encoding='PCM_S', bits_per_sample=16)

+# --------------------------
+# Gradio Interface
+# --------------------------
+@spaces.GPU(duration=120)
+def infer(genre, lyrics, num_segments=2, max_tokens=2000):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        return generate_music(genre, lyrics, num_segments, max_tokens)

+# Initialize models at startup
+preload_models()

+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("# YuE Music Generator with KV Cache Optimization")
+    with gr.Row():
+        with gr.Column():
+            genre_txt = gr.Textbox(label="Genre", placeholder="e.g., pop electronic female vocal")
+            lyrics_txt = gr.Textbox(label="Lyrics", lines=8,
+                                    placeholder="""[verse]\nYour lyrics here...""")
+            num_segments = gr.Slider(1, 10, value=2, label="Song Segments")
+            max_tokens = gr.Slider(100, 3000, value=1000, step=100,
+                                   label="Max Tokens per Segment (100 tokens ≈ 1 second)")
+            submit_btn = gr.Button("Generate Music")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Music", interactive=False)

+    gr.Examples(
+        examples=[
+            ["pop rock male vocal", "[verse]\nStanding in the light..."],
+            ["electronic dance synth female", "[drop]\nFeel the rhythm..."]
+        ],
+        inputs=[genre_txt, lyrics_txt],
+        outputs=audio_output
+    )

     submit_btn.click(
+        fn=infer,
+        inputs=[genre_txt, lyrics_txt, num_segments, max_tokens],
+        outputs=audio_output
     )

+demo.queue(concurrency_count=2).launch()
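An illustrative aside, not part of the diff: queue(concurrency_count=...) is the Gradio 3.x spelling; if the Space were moved to Gradio 4.x, the rough equivalent would be:

demo.queue(default_concurrency_limit=2).launch()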