Adding stage 2 back, since with only stage 1 the vocal quality is very bad. This restores the YuE-s2-1B-general stage 2 refinement pass, the vocoder upsampling of the stems to 44.1 kHz, and the low-frequency post-processing of the final mix.
app.py CHANGED
@@ -46,7 +46,6 @@ except FileNotFoundError:
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
 sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
 
-
 # don't change above code
 
 import argparse
@@ -67,11 +66,12 @@ import time
 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
-
-
+from vocoder import build_codec_model, process_audio  # added vocoder back
+from post_process_audio import replace_low_freq_with_energy_matched  # added post-process back
 
 device = "cuda:0"
 
+# Stage 1 model
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
@@ -80,15 +80,26 @@ model = AutoModelForCausalLM.from_pretrained(
 ).to(device)
 model.eval()
 
+# Stage 2 model
+stage2_model_path = "m-a-p/YuE-s2-1B-general"
+model_stage2 = AutoModelForCausalLM.from_pretrained(
+    stage2_model_path,
+    torch_dtype=torch.float16,
+    attn_implementation="flash_attention_2"
+)
+model_stage2.to(device)
+model_stage2.eval()
+
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
-
-
-
+config_path = './xcodec_mini_infer/decoders/config.yaml'  # added vocoder
+vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth'  # added vocoder
+inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth'  # added vocoder
 
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 
 codectool = CodecManipulator("xcodec", 0, 1)
+codectool_stage2 = CodecManipulator("xcodec", 0, 8)
 model_config = OmegaConf.load(basic_model_config)
 # Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
@@ -97,15 +108,14 @@ codec_model.load_state_dict(parameter_dict['codec_model'])
 # codec_model = torch.compile(codec_model)
 codec_model.eval()
 
-# Preload and compile vocoders #
-
-
-
+# Preload and compile vocoders  # added vocoder
+vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
+vocal_decoder.to(device)
+inst_decoder.to(device)
 #vocal_decoder = torch.compile(vocal_decoder)
 #inst_decoder = torch.compile(inst_decoder)
-
-
-
+vocal_decoder.eval()
+inst_decoder.eval()
 
 @spaces.GPU(duration=120)
 def generate_music(
@@ -127,7 +137,9 @@ def generate_music(
 
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
+        stage2_output_dir = stage1_output_dir.replace('stage1', 'stage2')
         os.makedirs(stage1_output_dir, exist_ok=True)
+        os.makedirs(stage2_output_dir, exist_ok=True)
 
         class BlockTokenRangeProcessor(LogitsProcessor):
             def __init__(self, start_id, end_id):
@@ -268,6 +280,138 @@ def generate_music(
         stage1_output_set.append(vocal_save_path)
         stage1_output_set.append(inst_save_path)
 
+        print("Stage 2 inference...")
+        def stage2_generate(model, prompt, batch_size=16):
+            codec_ids = codectool.unflatten(prompt, n_quantizer=1)
+            codec_ids = codectool.offset_tok_ids(
+                codec_ids,
+                global_offset=codectool.global_offset,
+                codebook_size=codectool.codebook_size,
+                num_codebooks=codectool.num_codebooks,
+            ).astype(np.int32)
+
+            # Prepare prompt_ids based on batch size or single input
+            if batch_size > 1:
+                codec_list = []
+                for i in range(batch_size):
+                    idx_begin = i * 300
+                    idx_end = (i + 1) * 300
+                    codec_list.append(codec_ids[:, idx_begin:idx_end])
+
+                codec_ids = np.concatenate(codec_list, axis=0)
+                prompt_ids = np.concatenate(
+                    [
+                        np.tile([mmtokenizer.soa, mmtokenizer.stage_1], (batch_size, 1)),
+                        codec_ids,
+                        np.tile([mmtokenizer.stage_2], (batch_size, 1)),
+                    ],
+                    axis=1
+                )
+            else:
+                prompt_ids = np.concatenate([
+                    np.array([mmtokenizer.soa, mmtokenizer.stage_1]),
+                    codec_ids.flatten(),  # flatten the 2D array to 1D
+                    np.array([mmtokenizer.stage_2])
+                ]).astype(np.int32)
+                prompt_ids = prompt_ids[np.newaxis, ...]
+
+            codec_ids = torch.as_tensor(codec_ids).to(device)
+            prompt_ids = torch.as_tensor(prompt_ids).to(device)
+            len_prompt = prompt_ids.shape[-1]
+
+            block_list = LogitsProcessorList([BlockTokenRangeProcessor(0, 46358), BlockTokenRangeProcessor(53526, mmtokenizer.vocab_size)])
+
+            # Teacher-forcing generate loop
+            for frames_idx in range(codec_ids.shape[1]):
+                cb0 = codec_ids[:, frames_idx:frames_idx+1]
+                prompt_ids = torch.cat([prompt_ids, cb0], dim=1)
+                input_ids = prompt_ids
+
+                with torch.no_grad():
+                    stage2_output = model.generate(input_ids=input_ids,
+                        min_new_tokens=7,
+                        max_new_tokens=7,
+                        eos_token_id=mmtokenizer.eoa,
+                        pad_token_id=mmtokenizer.eoa,
+                        logits_processor=block_list,
+                    )
+
+                assert stage2_output.shape[1] - prompt_ids.shape[1] == 7, f"output new tokens={stage2_output.shape[1]-prompt_ids.shape[1]}"
+                prompt_ids = stage2_output
+
+            # Return output based on batch size
+            if batch_size > 1:
+                output = prompt_ids.cpu().numpy()[:, len_prompt:]
+                output_list = [output[i] for i in range(batch_size)]
+                output = np.concatenate(output_list, axis=0)
+            else:
+                output = prompt_ids[0].cpu().numpy()[len_prompt:]
+
+            return output
+
+        def stage2_inference(model, stage1_output_set, stage2_output_dir, batch_size=4):
+            stage2_result = []
+            for i in tqdm(range(len(stage1_output_set))):
+                output_filename = os.path.join(stage2_output_dir, os.path.basename(stage1_output_set[i]))
+
+                if os.path.exists(output_filename):
+                    print(f'{output_filename} stage2 already done.')
+                    continue
+
+                # Load the prompt
+                prompt = np.load(stage1_output_set[i]).astype(np.int32)
+
+                # Only accept 6s segments
+                output_duration = prompt.shape[-1] // 50 // 6 * 6
+                num_batch = output_duration // 6
+
+                if num_batch <= batch_size:
+                    # If num_batch is less than or equal to batch_size, infer the entire prompt at once
+                    output = stage2_generate(model, prompt[:, :output_duration*50], batch_size=num_batch)
+                else:
+                    # If num_batch is greater than batch_size, process in chunks of batch_size
+                    segments = []
+                    num_segments = (num_batch // batch_size) + (1 if num_batch % batch_size != 0 else 0)
+
+                    for seg in range(num_segments):
+                        start_idx = seg * batch_size * 300
+                        # Ensure end_idx does not exceed the available length
+                        end_idx = min((seg + 1) * batch_size * 300, output_duration*50)  # adjust the last segment
+                        current_batch_size = batch_size if seg != num_segments-1 or num_batch % batch_size == 0 else num_batch % batch_size
+                        segment = stage2_generate(
+                            model,
+                            prompt[:, start_idx:end_idx],
+                            batch_size=current_batch_size
+                        )
+                        segments.append(segment)
+
+                    # Concatenate all the segments
+                    output = np.concatenate(segments, axis=0)
+
+                # Process the ending part of the prompt
+                if output_duration*50 != prompt.shape[-1]:
+                    ending = stage2_generate(model, prompt[:, output_duration*50:], batch_size=1)
+                    output = np.concatenate([output, ending], axis=0)
+                output = codectool_stage2.ids2npy(output)
+
+                # Fix invalid codes (a dirty solution, which may harm audio quality)
+                # We are trying to find a better one
+                fixed_output = copy.deepcopy(output)
+                for i, line in enumerate(output):
+                    for j, element in enumerate(line):
+                        if element < 0 or element > 1023:
+                            counter = Counter(line)
+                            most_frequent = sorted(counter.items(), key=lambda x: x[1], reverse=True)[0][0]
+                            fixed_output[i, j] = most_frequent
+                # Save output
+                np.save(output_filename, fixed_output)
+                stage2_result.append(output_filename)
+            return stage2_result
+
+        stage2_result = stage2_inference(model_stage2, stage1_output_set, stage2_output_dir, batch_size=4)
+        print(stage2_result)
+        print('Stage 2 DONE.\n')
+
         print("Converting to Audio...")
 
         # convert audio tokens to audio
@@ -285,7 +429,7 @@ def generate_music(
         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
         os.makedirs(recons_mix_dir, exist_ok=True)
         tracks = []
-        for npy in stage1_output_set:
+        for npy in stage2_result:
             codec_result = np.load(npy)
             decodec_rlt = []
             with torch.no_grad():
@@ -312,11 +456,59 @@ def generate_music(
             vocal_stem, sr = sf.read(vocal_path)
             instrumental_stem, _ = sf.read(inst_path)
             mix_stem = (vocal_stem + instrumental_stem) / 1
-
+            sf.write(recons_mix, mix_stem, sr)  # saving 16k mix audio
         except Exception as e:
            print(e)
-
+
+        print("Upsampling audio...")
+        # vocoder to upsample audios
+        vocoder_output_dir = os.path.join(output_dir, 'vocoder')
+        vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
+        vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
+        os.makedirs(vocoder_mix_dir, exist_ok=True)
+        os.makedirs(vocoder_stems_dir, exist_ok=True)
+        for npy in stage2_result:
+            if 'instrumental' in npy:
+                # Process instrumental
+                instrumental_output = process_audio(
+                    npy,
+                    os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
+                    rescale,
+                    None,
+                    inst_decoder,
+                    codec_model
+                )
+            else:
+                # Process vocal
+                vocal_output = process_audio(
+                    npy,
+                    os.path.join(vocoder_stems_dir, 'vocal.mp3'),
+                    rescale,
+                    None,
+                    vocal_decoder,
+                    codec_model
+                )
+        # mix tracks
+        try:
+            mix_output = instrumental_output + vocal_output
+            vocoder_mix = os.path.join(vocoder_mix_dir, os.path.basename(recons_mix))
+            save_audio(mix_output, vocoder_mix, 44100, rescale)  # saving 44.1k mix audio
+            print(f"Created mix: {vocoder_mix}")
+        except RuntimeError as e:
+            print(e)
+            print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
+
+        # Post process
+        final_mix_path = os.path.join(output_dir, os.path.basename(recons_mix))
+        replace_low_freq_with_energy_matched(
+            a_file=recons_mix,  # 16kHz
+            b_file=vocoder_mix,  # 48kHz
+            c_file=final_mix_path,
+            cutoff_freq=5500.0
+        )
 
+        # return final mix, upsampled vocal stem, upsampled instrumental stem
+        return (44100, (mix_output.cpu().numpy() * 32767).astype(np.int16)), (44100, (vocal_output.cpu().numpy() * 32767).astype(np.int16)), (44100, (instrumental_output.cpu().numpy() * 32767).astype(np.int16))
 
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
     # Execute the command
@@ -330,7 +522,6 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=
     finally:
         print("Temporary files deleted.")
 
-
 # Gradio
 with gr.Blocks() as demo:
     with gr.Column():
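
Notes on the restored stage 2 pipeline follow. The sketches below are illustrative only; every helper name in them is an assumption of mine, not something defined in app.py.

Stage 2 token bookkeeping: stage2_generate builds the prompt as [soa, stage_1, stage-1 codes, stage_2], then teacher-forces one stage-1 code per 50 Hz frame while min_new_tokens == max_new_tokens == 7 forces exactly seven more tokens, i.e. eight codebook entries per frame, matching codectool_stage2 = CodecManipulator("xcodec", 0, 8). A quick arithmetic check for one 6-second window:

# Token bookkeeping for one 6-second stage-2 window (illustrative names).
FRAMES = 300                          # 6 s of xcodec tokens at 50 frames/s
CODEBOOKS = 8                         # stage 2 fills 8 codebooks per frame

prompt_len = 2 + FRAMES + 1           # [soa, stage_1] + 300 stage-1 codes + [stage_2]
generated_len = FRAMES * (1 + 7)      # 1 teacher-forced code + 7 generated tokens per frame

print(prompt_len, generated_len)      # 303 2400
assert generated_len == FRAMES * CODEBOOKS   # 2400 ids reshape to an (8, 300) grid via ids2npy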
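Segment planning: stage2_inference only accepts whole 6-second windows, and at 50 frames per second that is 300 frames per batch item, which is why the slicing in the diff moves in steps of batch_size * 300. A self-contained sketch of the same arithmetic (plan_segments is a hypothetical helper, not in the app):

FRAMES_PER_SEC = 50
SEG_SECONDS = 6

def plan_segments(total_frames: int, batch_size: int = 4):
    # Mirrors the output_duration / num_batch math in stage2_inference.
    output_duration = total_frames // FRAMES_PER_SEC // SEG_SECONDS * SEG_SECONDS
    num_batch = output_duration // SEG_SECONDS            # number of 300-frame windows
    num_calls = 1 if num_batch <= batch_size else -(-num_batch // batch_size)  # ceil division
    leftover = total_frames - output_duration * FRAMES_PER_SEC  # sub-6 s tail, decoded with batch_size=1
    return output_duration, num_batch, num_calls, leftover

# A 95-second clip (4750 frames): 15 full windows -> 4 batched calls, 250 tail frames.
print(plan_segments(95 * FRAMES_PER_SEC))  # (90, 15, 4, 250)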
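Invalid-code repair: stage 2 can emit ids outside the 1024-entry codebook, and the commit patches each one with the most frequent code in the same codebook row, a fix the code itself calls dirty. The same repair, condensed (fix_invalid_codes is my name for it):

import numpy as np
from collections import Counter

def fix_invalid_codes(output: np.ndarray, codebook_size: int = 1024) -> np.ndarray:
    # Replace out-of-range codes with the row's most frequent value,
    # matching the "element < 0 or element > 1023" check in the diff.
    fixed = output.copy()
    for i, row in enumerate(output):
        bad = (row < 0) | (row >= codebook_size)
        if bad.any():
            most_frequent = Counter(row.tolist()).most_common(1)[0][0]
            fixed[i, bad] = most_frequent
    return fixed

codes = np.array([[1, 1, 2, 9999], [5, -3, 5, 7]])
print(fix_invalid_codes(codes))  # [[1 1 2 1] [5 5 5 7]]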
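Gradio output format: generate_music now returns (sample_rate, int16_array) tuples, the numpy form gr.Audio accepts, scaling the decoders' float output by 32767 to 16-bit PCM. A minimal version of that conversion (the clip() is my safety addition, not in the app):

import numpy as np

def to_gradio_audio(wave: np.ndarray, sample_rate: int = 44100):
    # Float waveform in [-1, 1] -> (rate, int16 PCM), as in generate_music's return.
    wave = np.clip(wave, -1.0, 1.0)
    return sample_rate, (wave * 32767).astype(np.int16)

rate, pcm = to_gradio_audio(np.array([0.0, 0.5, -1.0]))
print(rate, pcm)  # 44100 [     0  16383 -32767]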