YuE-music-generator-demo-zero

Paused

App Files Community

KingNish committed on Jan 31

Commit

24d1064

verified ·

1 Parent(s): 9ad37ed

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -279

app.py CHANGED Viewed

@@ -67,206 +67,44 @@ import time
 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
-from vocoder import build_codec_model, process_audio
-from post_process_audio import replace_low_freq_with_energy_matched
 device = "cuda:0"
-stage2_model = "m-a-p/YuE-s2-1B-general"
-model_stage2 = AutoModelForCausalLM.from_pretrained(
-    stage2_model,
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2"
-    ).to(device)
-model_stage2.eval()
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
 ).to(device)
 model.eval()
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
-config_path = './xcodec_mini_infer/decoders/config.yaml'
-vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth'
-inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth'
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 codectool = CodecManipulator("xcodec", 0, 1)
-codectool_stage2 = CodecManipulator("xcodec", 0, 8)
 model_config = OmegaConf.load(basic_model_config)
 # Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
 codec_model.eval()
-# Preload and compile vocoders
-vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
-vocal_decoder.to(device)
-inst_decoder.to(device)
-vocal_decoder.eval()
-inst_decoder.eval()
-class BlockTokenRangeProcessor(LogitsProcessor):
-    def __init__(self, start_id, end_id):
-        self.blocked_token_ids = list(range(start_id, end_id))
-    def __call__(self, input_ids, scores):
-        scores[:, self.blocked_token_ids] = -float("inf")
-        return scores
-def load_audio_mono(filepath, sampling_rate=16000):
-    audio, sr = torchaudio.load(filepath)
-    # Convert to mono
-    audio = torch.mean(audio, dim=0, keepdim=True)
-    # Resample if needed
-    if sr != sampling_rate:
-        resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
-        audio = resampler(audio)
-    return audio
-def split_lyrics(lyrics: str):
-    pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
-    segments = re.findall(pattern, lyrics, re.DOTALL)
-    structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
-    return structured_lyrics
-def stage2_generate(model, prompt, batch_size=1): # set batch_size=1 for gradio demo
-    codec_ids = codectool.unflatten(prompt, n_quantizer=1)
-    codec_ids = codectool.offset_tok_ids(
-                    codec_ids,
-                    global_offset=codectool.global_offset,
-                    codebook_size=codectool.codebook_size,
-                    num_codebooks=codectool.num_codebooks,
-                ).astype(np.int32)
-    # Prepare prompt_ids based on batch size or single input
-    if batch_size > 1:
-        codec_list = []
-        for i in range(batch_size):
-            idx_begin = i * 300
-            idx_end = (i + 1) * 300
-            codec_list.append(codec_ids[:, idx_begin:idx_end])
-        codec_ids = np.concatenate(codec_list, axis=0)
-        prompt_ids = np.concatenate(
-            [
-                np.tile([mmtokenizer.soa, mmtokenizer.stage_1], (batch_size, 1)),
-                codec_ids,
-                np.tile([mmtokenizer.stage_2], (batch_size, 1)),
-            ],
-            axis=1
-        )
-    else:
-        prompt_ids = np.concatenate([
-            np.array([mmtokenizer.soa, mmtokenizer.stage_1]),
-            codec_ids.flatten(),  # Flatten the 2D array to 1D
-            np.array([mmtokenizer.stage_2])
-        ]).astype(np.int32)
-        prompt_ids = prompt_ids[np.newaxis, ...]
-    codec_ids = torch.as_tensor(codec_ids).to(device)
-    prompt_ids = torch.as_tensor(prompt_ids).to(device)
-    len_prompt = prompt_ids.shape[-1]
-    block_list = LogitsProcessorList([BlockTokenRangeProcessor(0, 46358), BlockTokenRangeProcessor(53526, mmtokenizer.vocab_size)])
-    # Teacher forcing generate loop
-    for frames_idx in range(codec_ids.shape[1]):
-        cb0 = codec_ids[:, frames_idx:frames_idx+1]
-        prompt_ids = torch.cat([prompt_ids, cb0], dim=1)
-        input_ids = prompt_ids
-        with torch.no_grad():
-            stage2_output = model.generate(input_ids=input_ids,
-                min_new_tokens=7,
-                max_new_tokens=7,
-                eos_token_id=mmtokenizer.eoa,
-                pad_token_id=mmtokenizer.eoa,
-                logits_processor=block_list,
-            )
-        assert stage2_output.shape[1] - prompt_ids.shape[1] == 7, f"output new tokens={stage2_output.shape[1]-prompt_ids.shape[1]}"
-        prompt_ids = stage2_output
-    # Return output based on batch size
-    if batch_size > 1:
-        output = prompt_ids.cpu().numpy()[:, len_prompt:]
-        output_list = [output[i] for i in range(batch_size)]
-        output = np.concatenate(output_list, axis=0)
-    else:
-        output = prompt_ids[0].cpu().numpy()[len_prompt:]
-    return output
-def stage2_inference(model, stage1_output_set, stage2_output_dir, batch_size=1): # set batch_size=1 for gradio demo
-    stage2_result = []
-    for i in tqdm(range(len(stage1_output_set))):
-        output_filename = os.path.join(stage2_output_dir, os.path.basename(stage1_output_set[i]))
-        if os.path.exists(output_filename):
-            print(f'{output_filename} stage2 has done.')
-            continue
-        # Load the prompt
-        prompt = np.load(stage1_output_set[i]).astype(np.int32)
-        # Only accept 6s segments
-        output_duration = prompt.shape[-1] // 50 // 6 * 6
-        num_batch = output_duration // 6
-        if output_duration <= 0:
-            print(f'{output_filename} stage1 output is too short, skipping stage2.')
-            continue
-        if num_batch <= batch_size:
-            # If num_batch is less than or equal to batch_size, we can infer the entire prompt at once
-            output = stage2_generate(model, prompt[:, :output_duration*50], batch_size=num_batch)
-        else:
-            # If num_batch is greater than batch_size, process in chunks of batch_size
-            segments = []
-            num_segments = (num_batch // batch_size) + (1 if num_batch % batch_size != 0 else 0)
-            for seg in range(num_segments):
-                start_idx = seg * batch_size * 300
-                # Ensure the end_idx does not exceed the available length
-                end_idx = min((seg + 1) * batch_size * 300, output_duration*50)  # Adjust the last segment
-                current_batch_size = batch_size if seg != num_segments-1 or num_batch % batch_size == 0 else num_batch % batch_size
-                segment = stage2_generate(
-                    model,
-                    prompt[:, start_idx:end_idx],
-                    batch_size=current_batch_size
-                )
-                segments.append(segment)
-            # Concatenate all the segments
-            output = np.concatenate(segments, axis=0)
-        # Process the ending part of the prompt
-        if output_duration*50 != prompt.shape[-1]:
-            ending = stage2_generate(model, prompt[:, output_duration*50:], batch_size=1)
-            output = np.concatenate([output, ending], axis=0)
-        output = codectool_stage2.ids2npy(output)
-        # Fix invalid codes (a dirty solution, which may harm the quality of audio)
-        # We are trying to find better one
-        fixed_output = copy.deepcopy(output)
-        for i, line in enumerate(output):
-            for j, element in enumerate(line):
-                if element < 0 or element > 1023:
-                    counter = Counter(line)
-                    most_frequant = sorted(counter.items(), key=lambda x: x[1], reverse=True)[0][0]
-                    fixed_output[i, j] = most_frequant
-        # save output
-        np.save(output_filename, fixed_output)
-        stage2_result.append(output_filename)
-    return stage2_result
 @spaces.GPU(duration=120)
@@ -289,10 +127,33 @@ def generate_music(
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
-        stage2_output_dir = stage1_output_dir.replace('stage1', 'stage2')
         os.makedirs(stage1_output_dir, exist_ok=True)
-        os.makedirs(stage2_output_dir, exist_ok=True)
         stage1_output_set = []
         genres = genre_txt.strip()
@@ -407,10 +268,6 @@ def generate_music(
         stage1_output_set.append(vocal_save_path)
         stage1_output_set.append(inst_save_path)
-        print("Stage 2 inference...")
-        stage2_result = stage2_inference(model_stage2, stage1_output_set, stage2_output_dir, batch_size=1) # set batch_size=1 for gradio demo
-        print('Stage 2 DONE.\n')
         print("Converting to Audio...")
         # convert audio tokens to audio
@@ -423,14 +280,14 @@ def generate_music(
             wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
             torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
-        # reconstruct tracks from stage 1
-        recons_output_dir = os.path.join(output_dir, "recons_stage1") # changed folder name to recons_stage1
         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
         os.makedirs(recons_mix_dir, exist_ok=True)
-        tracks_stage1 = [] # changed variable name to tracks_stage1
         for npy in stage1_output_set:
             codec_result = np.load(npy)
-            decodec_rlt=[]
             with torch.no_grad():
                 decoded_waveform = codec_model.decode(
                     torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
@@ -438,61 +295,11 @@ def generate_music(
             decoded_waveform = decoded_waveform.cpu().squeeze(0)
             decodec_rlt.append(torch.as_tensor(decoded_waveform))
             decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + "_stage1.mp3") # changed filename to include _stage1
-            tracks_stage1.append(save_path) # changed variable name to tracks_stage1
             save_audio(decodec_rlt, save_path, 16000)
-        # reconstruct tracks from stage 2 and vocoder
-        recons_output_dir = os.path.join(output_dir, "recons_stage2_vocoder") # changed folder name to recons_stage2_vocoder
-        recons_mix_dir = os.path.join(recons_output_dir, 'mix')
-        os.makedirs(recons_mix_dir, exist_ok=True)
-        tracks_stage2_vocoder = [] # changed variable name to tracks_stage2_vocoder
-        vocoder_stems_dir = os.path.join(recons_output_dir, 'stems') # vocoder output stems in recons_stage2_vocoder
-        os.makedirs(vocoder_stems_dir, exist_ok=True)
-        vocal_output = None # initialize for mix error handling
-        instrumental_output = None # initialize for mix error handling
-        for npy in stage2_result:
-            if 'instrumental' in npy:
-                # Process instrumental
-                instrumental_output = process_audio(
-                    npy,
-                    os.path.join(vocoder_stems_dir, 'instrumental.mp3'), # vocoder output to vocoder_stems_dir
-                    rescale,
-                    None, # Removed args, use default vocoder args
-                    inst_decoder,
-                    codec_model
-                )
-            else:
-                # Process vocal
-                vocal_output = process_audio(
-                    npy,
-                    os.path.join(vocoder_stems_dir, 'vocal.mp3'), # vocoder output to vocoder_stems_dir
-                    rescale,
-                    None, # Removed args, use default vocoder args
-                    vocal_decoder,
-                    codec_model
-                )
-        # mix tracks from vocoder output
-        try:
-            mix_output = instrumental_output + vocal_output
-            vocoder_mix = os.path.join(recons_mix_dir, 'mixed_stage2_vocoder.mp3') # mixed output in recons_stage2_vocoder, changed filename
-            save_audio(mix_output, vocoder_mix, 44100, rescale)
-            print(f"Created mix: {vocoder_mix}")
-            tracks_stage2_vocoder.append(vocoder_mix) # add mixed vocoder output path
-        except RuntimeError as e:
-            print(e)
-            vocoder_mix = None # set to None if mix failed
-            print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape if instrumental_output is not None else 'None'}, vocal: {vocal_output.shape if vocal_output is not None else 'None'}")
-        # mix tracks from stage 1
-        mixed_stage1_path = None
-        vocal_stage1_path = None
-        instrumental_stage1_path = None
-        for inst_path in tracks_stage1: # changed variable name to tracks_stage1
             try:
                 if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
                         and 'instrumental' in inst_path:
@@ -501,45 +308,17 @@ def generate_music(
                     if not os.path.exists(vocal_path):
                         continue
                     # mix
-                    recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental_stage1', 'mixed_stage1')) # changed mixed filename
-                    vocal_stem, sr = sf.read(vocal_path)
-                    instrumental_stem, _ = sf.read(inst_path)
                     mix_stem = (vocal_stem + instrumental_stem) / 1
-                    sf.write(recons_mix, mix_stem, sr)
-                    mixed_stage1_path = recons_mix # store mixed stage 1 path
-                    vocal_stage1_path = vocal_path # store vocal stage 1 path
-                    instrumental_stage1_path = inst_path # store instrumental stage 1 path
             except Exception as e:
                 print(e)
-        # Post process - skip post process for gradio to simplify.
-        # recons_mix_final_path = os.path.join(output_dir, os.path.basename(mixed_stage1_path).replace('_stage1', '_final')) # final output path
-        # replace_low_freq_with_energy_matched(
-        #     a_file=mixed_stage1_path,     # 16kHz
-        #     b_file=vocoder_mix,     # 48kHz
-        #     c_file=recons_mix_final_path,
-        #     cutoff_freq=5500.0
-        # )
-        if vocoder_mix is not None: # return vocoder mix if successful
-            mixed_audio_data, sr_vocoder_mix = sf.read(vocoder_mix)
-            vocal_audio_data = None # stage 2 vocoder stems are not mixed and returned in this demo, set to None
-            instrumental_audio_data = None # stage 2 vocoder stems are not mixed and returned in this demo, set to None
-            return (sr_vocoder_mix, (mixed_audio_data * 32767).astype(np.int16)), vocal_audio_data, instrumental_audio_data
-        elif mixed_stage1_path is not None: # if vocoder failed, return stage 1 mix
-            mixed_audio_data_stage1, sr_stage1_mix = sf.read(mixed_stage1_path)
-            vocal_audio_data_stage1, sr_vocal_stage1 = sf.read(vocal_stage1_path)
-            instrumental_audio_data_stage1, sr_inst_stage1 = sf.read(instrumental_stage1_path)
-            return (sr_stage1_mix, (mixed_audio_data_stage1 * 32767).astype(np.int16)), (sr_vocal_stage1, (vocal_audio_data_stage1 * 32767).astype(np.int16)), (sr_inst_stage1, (instrumental_audio_data_stage1 * 32767).astype(np.int16))
-        else: # if both failed, return None
-             return None, None, None
-def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=5):
     # Execute the command
     try:
         mixed_audio_data, vocal_audio_data, instrumental_audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
@@ -579,10 +358,10 @@ with gr.Blocks() as demo:
                 max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                 submit_btn = gr.Button("Submit")
-                music_out = gr.Audio(label="Mixed Audio Result (Stage 2 + Vocoder)")
-                with gr.Accordion(label="Stage 1 Vocal and Instrumental Result", open=False):
-                    vocal_out = gr.Audio(label="Vocal Audio (Stage 1)")
-                    instrumental_out = gr.Audio(label="Instrumental Audio (Stage 1)")
         gr.Examples(
             examples=[

 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
+#from vocoder import build_codec_model, process_audio # removed vocoder
+#from post_process_audio import replace_low_freq_with_energy_matched # removed post process
 device = "cuda:0"
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
+    # low_cpu_mem_usage=True,
 ).to(device)
 model.eval()
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
+#config_path = './xcodec_mini_infer/decoders/config.yaml' # removed vocoder
+#vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth' # removed vocoder
+#inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth' # removed vocoder
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
 # Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
+# codec_model = torch.compile(codec_model)
 codec_model.eval()
+# Preload and compile vocoders # removed vocoder
+#vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
+#vocal_decoder.to(device)
+#inst_decoder.to(device)
+#vocal_decoder = torch.compile(vocal_decoder)
+#inst_decoder = torch.compile(inst_decoder)
+#vocal_decoder.eval()
+#inst_decoder.eval()
 @spaces.GPU(duration=120)
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         os.makedirs(stage1_output_dir, exist_ok=True)
+        class BlockTokenRangeProcessor(LogitsProcessor):
+            def __init__(self, start_id, end_id):
+                self.blocked_token_ids = list(range(start_id, end_id))
+            def __call__(self, input_ids, scores):
+                scores[:, self.blocked_token_ids] = -float("inf")
+                return scores
+        def load_audio_mono(filepath, sampling_rate=16000):
+            audio, sr = torchaudio.load(filepath)
+            # Convert to mono
+            audio = torch.mean(audio, dim=0, keepdim=True)
+            # Resample if needed
+            if sr != sampling_rate:
+                resampler = Resample(orig_freq=sr, new_freq=sampling_rate)
+                audio = resampler(audio)
+            return audio
+        def split_lyrics(lyrics: str):
+            pattern = r"\[(\w+)\](.*?)\n(?=\[|\Z)"
+            segments = re.findall(pattern, lyrics, re.DOTALL)
+            structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
+            return structured_lyrics
+        # Call the function and print the result
         stage1_output_set = []
         genres = genre_txt.strip()
         stage1_output_set.append(vocal_save_path)
         stage1_output_set.append(inst_save_path)
         print("Converting to Audio...")
         # convert audio tokens to audio
             wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
             torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
+        # reconstruct tracks
+        recons_output_dir = os.path.join(output_dir, "recons")
         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
         os.makedirs(recons_mix_dir, exist_ok=True)
+        tracks = []
         for npy in stage1_output_set:
             codec_result = np.load(npy)
+            decodec_rlt = []
             with torch.no_grad():
                 decoded_waveform = codec_model.decode(
                     torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
             decoded_waveform = decoded_waveform.cpu().squeeze(0)
             decodec_rlt.append(torch.as_tensor(decoded_waveform))
             decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
+            tracks.append(save_path)
             save_audio(decodec_rlt, save_path, 16000)
+        # mix tracks
+        for inst_path in tracks:
             try:
                 if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
                         and 'instrumental' in inst_path:
                     if not os.path.exists(vocal_path):
                         continue
                     # mix
+                    recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
+                    vocal_stem, sr = sf.read(inst_path)
+                    instrumental_stem, _ = sf.read(vocal_path)
                     mix_stem = (vocal_stem + instrumental_stem) / 1
+                    return (sr, (mix_stem * 32767).astype(np.int16)), (sr, (vocal_stem * 32767).astype(np.int16)), (sr, (instrumental_stem * 32767).astype(np.int16))
             except Exception as e:
                 print(e)
+                return None, None, None
+def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
     # Execute the command
     try:
         mixed_audio_data, vocal_audio_data, instrumental_audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
                 max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                 submit_btn = gr.Button("Submit")
+                music_out = gr.Audio(label="Mixed Audio Result")
+                with gr.Accordion(label="Vocal and Instrumental Result", open=False):
+                    vocal_out = gr.Audio(label="Vocal Audio")
+                    instrumental_out = gr.Audio(label="Instrumental Audio")
         gr.Examples(
             examples=[