YuE-music-generator-demo-zero

Running on Zero

App Files Files Community

KingNish committed on Jan 31

Commit

85b4489

verified ·

1 Parent(s): a1a370d

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -74

app.py CHANGED Viewed

@@ -67,8 +67,8 @@ import time
 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
-from vocoder import build_codec_model, process_audio
-from post_process_audio import replace_low_freq_with_energy_matched
 device = "cuda:0"
@@ -82,9 +82,9 @@ model.eval()
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
-config_path = './xcodec_mini_infer/decoders/config.yaml'
-vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth'
-inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth'
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
@@ -97,14 +97,15 @@ codec_model.load_state_dict(parameter_dict['codec_model'])
 # codec_model = torch.compile(codec_model)
 codec_model.eval()
-# Preload and compile vocoders - Not using vocoder now
-# vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
-# vocal_decoder.to(device)
-# inst_decoder.to(device)
-# vocal_decoder = torch.compile(vocal_decoder)
-# inst_decoder = torch.compile(inst_decoder)
-# vocal_decoder.eval()
-# inst_decoder.eval()
 @spaces.GPU(duration=120)
 def generate_music(
@@ -245,8 +246,8 @@ def generate_music(
         if len(soa_idx) != len(eoa_idx):
             raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
-        vocals_codec_results = []
-        instrumentals_codec_results = []
         range_begin = 1 if use_audio_prompt else 0
         for i in range(range_begin, len(soa_idx)):
             codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
@@ -254,63 +255,27 @@ def generate_music(
                 codec_ids = codec_ids[1:]
             codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
             vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
-            vocals_codec_results.append(vocals_ids)
             instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
-            instrumentals_codec_results.append(instrumentals_ids)
-        vocals_codec_result = np.concatenate(vocals_codec_results, axis=1)
-        instrumentals_codec_result = np.concatenate(instrumentals_codec_results, axis=1)
-        print("Converting to Audio...")
-        # convert audio tokens to audio
-        def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
-            folder_path = os.path.dirname(path)
-            if not os.path.exists(folder_path):
-                os.makedirs(folder_path)
-            limit = 0.99
-            max_val = wav.abs().max()
-            wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
-            torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
-        # reconstruct tracks
-        recons_output_dir = os.path.join(output_dir, "recons")
-        recons_mix_dir = os.path.join(recons_output_dir, 'mix')
-        os.makedirs(recons_mix_dir, exist_ok=True)
-        # Decode vocals
-        with torch.no_grad():
-            decoded_vocals_waveform = codec_model.decode(
-                torch.as_tensor(vocals_codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
-        decoded_vocals_waveform = decoded_vocals_waveform.cpu().squeeze(0)
-        # Decode instrumentals
         with torch.no_grad():
-            decoded_instrumentals_waveform = codec_model.decode(
-                torch.as_tensor(instrumentals_codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
-        decoded_instrumentals_waveform = decoded_instrumentals_waveform.cpu().squeeze(0)
-        # Mix tracks
-        mixed_waveform = (decoded_vocals_waveform + decoded_instrumentals_waveform) / 1.0
-        vocal_sr = 16000
-        instrumental_sr = 16000
-        mixed_sr = 16000
-        # added scaling to the audio
-        limit = 0.99
-        max_val = np.max(np.abs(mixed_waveform))
-        mixed_waveform = mixed_waveform * min(limit / max_val, 1)
-        max_val = np.max(np.abs(decoded_vocals_waveform))
-        decoded_vocals_waveform = decoded_vocals_waveform * min(limit/ max_val, 1)
-        max_val = np.max(np.abs(decoded_instrumentals_waveform))
-        decoded_instrumentals_waveform = decoded_instrumentals_waveform * min(limit/max_val,1)
-        print("All process Done")
-        return (mixed_sr, mixed_waveform.numpy()), (vocal_sr, decoded_vocals_waveform.numpy()), (instrumental_sr, decoded_instrumentals_waveform.numpy())
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
     # Execute the command
@@ -351,11 +316,11 @@ with gr.Blocks() as demo:
                 num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
                 max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                 submit_btn = gr.Button("Submit")
-                music_out_mix = gr.Audio(label="Final Audio Result", interactive=False)
-                with gr.Accordion(label="Vocal and Instrumental Result", open=False):
-                    music_out_vocals = gr.Audio(label="Vocal Audio Result", interactive=False)
-                    music_out_instrumental = gr.Audio(label="Instrumental Audio Result", interactive=False)
         gr.Examples(
             examples=[
@@ -401,17 +366,16 @@ Living out my dreams with this mic and a deal
                 ]
             ],
             inputs=[genre_txt, lyrics_txt],
-            outputs=[music_out_mix, music_out_vocals, music_out_instrumental],
             cache_examples=True,
             cache_mode="eager",
             fn=infer
         )
-        gr.Markdown("## We are actively working on improving YuE, and welcome community contributions! Feel free to submit PRs to enhance the model and demo.")
     submit_btn.click(
         fn=infer,
         inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
-        outputs=[music_out_mix, music_out_vocals, music_out_instrumental]
     )
 demo.queue().launch(show_error=True)

 import copy
 from collections import Counter
 from models.soundstream_hubert_new import SoundStream
+#from vocoder import build_codec_model, process_audio # removed vocoder
+#from post_process_audio import replace_low_freq_with_energy_matched # removed post process
 device = "cuda:0"
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
+#config_path = './xcodec_mini_infer/decoders/config.yaml' # removed vocoder
+#vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth' # removed vocoder
+#inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth' # removed vocoder
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 # codec_model = torch.compile(codec_model)
 codec_model.eval()
+# Preload and compile vocoders # removed vocoder
+#vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
+#vocal_decoder.to(device)
+#inst_decoder.to(device)
+#vocal_decoder = torch.compile(vocal_decoder)
+#inst_decoder = torch.compile(inst_decoder)
+#vocal_decoder.eval()
+#inst_decoder.eval()
 @spaces.GPU(duration=120)
 def generate_music(
         if len(soa_idx) != len(eoa_idx):
             raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
+        vocals = []
+        instrumentals = []
         range_begin = 1 if use_audio_prompt else 0
         for i in range(range_begin, len(soa_idx)):
             codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
                 codec_ids = codec_ids[1:]
             codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
             vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
+            vocals.append(vocals_ids)
             instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
+            instrumentals.append(instrumentals_ids)
+        vocals = np.concatenate(vocals, axis=1)
+        instrumentals = np.concatenate(instrumentals, axis=1)
+        #convert audio tokens to audio
         with torch.no_grad():
+            decoded_vocals = codec_model.decode(
+                torch.as_tensor(vocals.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
+                    device))
+            decoded_instrumentals = codec_model.decode(
+                torch.as_tensor(instrumentals.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
+                    device))
+        decoded_vocals = decoded_vocals.cpu().squeeze(0)
+        decoded_instrumentals = decoded_instrumentals.cpu().squeeze(0)
+        mixed_audio = (decoded_vocals + decoded_instrumentals)/2
+        return (16000, mixed_audio.numpy()), (16000, decoded_vocals.numpy()), (16000, decoded_instrumentals.numpy())
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
     # Execute the command
                 num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
                 max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                 submit_btn = gr.Button("Submit")
+                music_out = gr.Audio(label="Mixed Audio Result")
+                with gr.Accordion(label="Vocal and Instrumental Result", open=False):
+                    vocal_out = gr.Audio(label="Vocal Audio")
+                    instrumental_out = gr.Audio(label="Instrumental Audio")
         gr.Examples(
             examples=[
                 ]
             ],
             inputs=[genre_txt, lyrics_txt],
+            outputs=[music_out, vocal_out, instrumental_out],
             cache_examples=True,
             cache_mode="eager",
             fn=infer
         )
     submit_btn.click(
         fn=infer,
         inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
+        outputs=[music_out, vocal_out, instrumental_out]
     )
+    gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
 demo.queue().launch(show_error=True)